mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 08:52:56 +00:00
Merge pull request #8217 from neondatabase/rc/2024-07-01
Storage & Compute release 2024-07-01
This commit is contained in:
@@ -183,7 +183,7 @@ runs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
|
||||
2
.github/actions/download/action.yml
vendored
2
.github/actions/download/action.yml
vendored
@@ -26,7 +26,7 @@ runs:
|
||||
TARGET: ${{ inputs.path }}
|
||||
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
|
||||
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }}
|
||||
run: |
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
@@ -56,14 +56,14 @@ runs:
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Download Neon binaries for the previous release
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon-previous
|
||||
prefix: latest
|
||||
|
||||
@@ -89,7 +89,7 @@ runs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
4
.github/actions/upload/action.yml
vendored
4
.github/actions/upload/action.yml
vendored
@@ -8,7 +8,7 @@ inputs:
|
||||
description: "A directory or file to upload"
|
||||
required: true
|
||||
prefix:
|
||||
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
required: false
|
||||
|
||||
runs:
|
||||
@@ -45,7 +45,7 @@ runs:
|
||||
env:
|
||||
SOURCE: ${{ inputs.path }}
|
||||
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }}
|
||||
run: |
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
12
.github/workflows/benchmarking.yml
vendored
12
.github/workflows/benchmarking.yml
vendored
@@ -77,7 +77,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -235,7 +235,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -373,7 +373,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -473,7 +473,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -576,7 +576,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -677,7 +677,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.build-tools
|
||||
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
|
||||
29
.github/workflows/build_and_test.yml
vendored
29
.github/workflows/build_and_test.yml
vendored
@@ -109,7 +109,7 @@ jobs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
# !~/.cargo/registry/src
|
||||
# ~/.cargo/git/
|
||||
# target/
|
||||
# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
@@ -291,29 +291,29 @@ jobs:
|
||||
# target/
|
||||
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
# key: |
|
||||
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
@@ -411,7 +411,7 @@ jobs:
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
@@ -490,7 +490,7 @@ jobs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
@@ -639,7 +639,7 @@ jobs:
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Get coverage artifact
|
||||
@@ -763,7 +763,7 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -855,7 +855,7 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -875,7 +875,7 @@ jobs:
|
||||
file: Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
|
||||
@@ -1245,6 +1245,7 @@ jobs:
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
@@ -1339,7 +1340,7 @@ jobs:
|
||||
# Update Neon artifact for the release (reuse already uploaded artifact)
|
||||
for build_type in debug release; do
|
||||
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
|
||||
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
|
||||
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
|
||||
4
.github/workflows/pg_clients.yml
vendored
4
.github/workflows/pg_clients.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
@@ -85,7 +85,7 @@ jobs:
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
retention-days: 7
|
||||
name: python-test-pg_clients-${{ runner.os }}-stage-logs
|
||||
name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs
|
||||
path: ${{ env.TEST_OUTPUT }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
|
||||
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
|
||||
# * `-D clippy::todo` - don't let `todo!()` slip into `main`
|
||||
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo"
|
||||
|
||||
111
Cargo.lock
generated
111
Cargo.lock
generated
@@ -1246,7 +1246,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -1362,8 +1362,8 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml",
|
||||
"toml_edit",
|
||||
"toml 0.7.4",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
@@ -1669,9 +1669,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel"
|
||||
version = "2.1.4"
|
||||
version = "2.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
|
||||
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
@@ -1684,11 +1684,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel_derives"
|
||||
version = "2.1.2"
|
||||
version = "2.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
|
||||
checksum = "59de76a222c2b8059f789cbe07afbfd8deb8c31dd0bc2a21f85e256c1def8259"
|
||||
dependencies = [
|
||||
"diesel_table_macro_syntax",
|
||||
"dsl_auto_type",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
@@ -1696,9 +1697,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel_migrations"
|
||||
version = "2.1.0"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
|
||||
checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6"
|
||||
dependencies = [
|
||||
"diesel",
|
||||
"migrations_internals",
|
||||
@@ -1707,9 +1708,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel_table_macro_syntax"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
|
||||
checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25"
|
||||
dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
@@ -1745,6 +1746,20 @@ dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dsl_auto_type"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dyn-clone"
|
||||
version = "1.0.14"
|
||||
@@ -3084,19 +3099,19 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "migrations_internals"
|
||||
version = "2.1.0"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
|
||||
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"toml",
|
||||
"toml 0.8.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "migrations_macros"
|
||||
version = "2.1.0"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
|
||||
checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd"
|
||||
dependencies = [
|
||||
"migrations_internals",
|
||||
"proc-macro2",
|
||||
@@ -3576,7 +3591,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -3659,7 +3674,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"twox-hash",
|
||||
"url",
|
||||
@@ -4005,7 +4020,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4018,7 +4033,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -4037,7 +4052,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4665,7 +4680,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -5164,7 +5179,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
@@ -5443,9 +5458,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "0.6.2"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d"
|
||||
checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -6210,7 +6225,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6330,14 +6345,26 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.22.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.2"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f"
|
||||
checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -6352,7 +6379,20 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow",
|
||||
"winnow 0.4.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
|
||||
dependencies = [
|
||||
"indexmap 2.0.1",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow 0.6.13",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7335,6 +7375,15 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.6.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winreg"
|
||||
version = "0.50.0"
|
||||
@@ -7424,7 +7473,7 @@ dependencies = [
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
|
||||
@@ -73,13 +73,6 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
@@ -113,10 +106,10 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Compile and install the static OpenSSL library
|
||||
ENV OPENSSL_VERSION=3.2.2
|
||||
ENV OPENSSL_VERSION=1.1.1w
|
||||
ENV OPENSSL_PREFIX=/usr/local/openssl
|
||||
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
cd /tmp && \
|
||||
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
|
||||
@@ -83,12 +83,6 @@ pub fn write_postgres_conf(
|
||||
ComputeMode::Replica => {
|
||||
// hot_standby is 'on' by default, but let's be explicit
|
||||
writeln!(file, "hot_standby=on")?;
|
||||
|
||||
// Inform the replica about the primary state
|
||||
// Default is 'false'
|
||||
if let Some(primary_is_running) = spec.primary_is_running {
|
||||
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
// should be handled gracefully.
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.as_str();
|
||||
let mut connstr = compute.connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "compute_activity_monitor");
|
||||
let connstr = connstr.as_str();
|
||||
|
||||
// During startup and configuration we connect to every Postgres database,
|
||||
// but we don't want to count this as some user activity. So wait until
|
||||
|
||||
@@ -21,10 +21,8 @@ use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest};
|
||||
use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
@@ -600,13 +598,9 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
Some(("import", import_match)) => {
|
||||
let tenant_id = get_tenant_id(import_match, env)?;
|
||||
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
|
||||
let name = import_match
|
||||
.get_one::<String>("node-name")
|
||||
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||
let update_catalog = import_match
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
let branch_name = import_match
|
||||
.get_one::<String>("branch-name")
|
||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||
|
||||
// Parse base inputs
|
||||
let base_tarfile = import_match
|
||||
@@ -633,24 +627,11 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
println!("Importing timeline into pageserver ...");
|
||||
pageserver
|
||||
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
|
||||
.await?;
|
||||
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
|
||||
|
||||
println!("Creating endpoint for imported timeline ...");
|
||||
cplane.new_endpoint(
|
||||
name,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
None,
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
!update_catalog,
|
||||
)?;
|
||||
env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?;
|
||||
println!("Done");
|
||||
}
|
||||
Some(("branch", branch_match)) => {
|
||||
@@ -865,20 +846,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
let mut safekeepers: Vec<NodeId> = Vec::new();
|
||||
for sk_id in safekeepers_str.split(',').map(str::trim) {
|
||||
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
|
||||
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
|
||||
})?);
|
||||
safekeepers.push(sk_id);
|
||||
}
|
||||
safekeepers
|
||||
} else {
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? {
|
||||
safekeepers
|
||||
} else {
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
@@ -982,7 +956,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
endpoint.reconfigure(pageservers, None).await?;
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = parse_safekeepers(sub_args)?;
|
||||
endpoint.reconfigure(pageservers, None, safekeepers).await?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -1004,6 +981,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parse --safekeepers as list of safekeeper ids.
|
||||
fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> {
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
let mut safekeepers: Vec<NodeId> = Vec::new();
|
||||
for sk_id in safekeepers_str.split(',').map(str::trim) {
|
||||
let sk_id = NodeId(
|
||||
u64::from_str(sk_id)
|
||||
.map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?,
|
||||
);
|
||||
safekeepers.push(sk_id);
|
||||
}
|
||||
Ok(Some(safekeepers))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(ep_subcommand_data) => ep_subcommand_data,
|
||||
@@ -1487,8 +1481,7 @@ fn cli() -> Command {
|
||||
.about("Import timeline from basebackup directory")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(Arg::new("node-name").long("node-name")
|
||||
.help("Name to assign to the imported timeline"))
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(Arg::new("base-tarfile")
|
||||
.long("base-tarfile")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
@@ -1504,7 +1497,6 @@ fn cli() -> Command {
|
||||
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(update_catalog.clone())
|
||||
)
|
||||
).subcommand(
|
||||
Command::new("tenant")
|
||||
@@ -1609,7 +1601,7 @@ fn cli() -> Command {
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(endpoint_pageserver_id_arg.clone())
|
||||
.arg(safekeepers_arg)
|
||||
.arg(safekeepers_arg.clone())
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
.arg(allow_multiple.clone())
|
||||
@@ -1618,6 +1610,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
.arg(endpoint_pageserver_id_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
)
|
||||
|
||||
@@ -499,6 +499,23 @@ impl Endpoint {
|
||||
.join(",")
|
||||
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in sk_ids {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
Ok(safekeeper_connstrings)
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
@@ -523,18 +540,7 @@ impl Endpoint {
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in safekeepers {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
// if it is present, read it and pass to compute_ctl
|
||||
@@ -592,7 +598,6 @@ impl Endpoint {
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
primary_is_running: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -741,6 +746,7 @@ impl Endpoint {
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
@@ -775,6 +781,12 @@ impl Endpoint {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
if let Some(safekeepers) = safekeepers {
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
spec.safekeeper_connstrings = safekeeper_connstrings;
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
|
||||
@@ -17,8 +17,7 @@ use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
|
||||
TimelineInfo,
|
||||
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -397,28 +396,6 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let config = Self::parse_config(settings.clone())?;
|
||||
|
||||
let request = models::TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
|
||||
generation,
|
||||
config,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
// Placement policy is not meaningful for creations not done via storage controller
|
||||
placement_policy: None,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
Ok(self.http_client.tenant_create(&request).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -5,12 +5,11 @@ use crate::{
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use futures::StreamExt;
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
use std::{str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
|
||||
TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -21,7 +21,7 @@ use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
@@ -110,12 +110,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
config: String,
|
||||
},
|
||||
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
|
||||
/// alternative to the storage controller's scheduling optimization behavior.
|
||||
TenantScatter {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Print details about a particular tenant, including all its shards' states.
|
||||
TenantDescribe {
|
||||
#[arg(long)]
|
||||
@@ -342,14 +336,18 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Command::TenantCreate { tenant_id } => {
|
||||
vps_client
|
||||
.tenant_create(&TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
storcon_client
|
||||
.dispatch(
|
||||
Method::POST,
|
||||
"v1/tenant".to_string(),
|
||||
Some(TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantDelete { tenant_id } => {
|
||||
@@ -498,88 +496,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantScatter { tenant_id } => {
|
||||
// Find the shards
|
||||
let locate_response = storcon_client
|
||||
.dispatch::<(), TenantLocateResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = locate_response.shards;
|
||||
|
||||
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
let shard_count = shards.len();
|
||||
for s in shards {
|
||||
let entry = node_to_shards.entry(s.node_id).or_default();
|
||||
entry.push(s.shard_id);
|
||||
}
|
||||
|
||||
// Load list of available nodes
|
||||
let nodes_resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for node in nodes_resp {
|
||||
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
||||
node_to_shards.entry(node.id).or_default();
|
||||
}
|
||||
}
|
||||
|
||||
let max_shard_per_node = shard_count / node_to_shards.len();
|
||||
|
||||
loop {
|
||||
let mut migrate_shard = None;
|
||||
for shards in node_to_shards.values_mut() {
|
||||
if shards.len() > max_shard_per_node {
|
||||
// Pick the emptiest
|
||||
migrate_shard = Some(shards.pop().unwrap());
|
||||
}
|
||||
}
|
||||
let Some(migrate_shard) = migrate_shard else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Pick the emptiest node to migrate to
|
||||
let mut destinations = node_to_shards
|
||||
.iter()
|
||||
.map(|(k, v)| (k, v.len()))
|
||||
.collect::<Vec<_>>();
|
||||
destinations.sort_by_key(|i| i.1);
|
||||
let (destination_node, destination_count) = *destinations.first().unwrap();
|
||||
if destination_count + 1 > max_shard_per_node {
|
||||
// Even the emptiest destination doesn't have space: we're done
|
||||
break;
|
||||
}
|
||||
let destination_node = *destination_node;
|
||||
|
||||
node_to_shards
|
||||
.get_mut(&destination_node)
|
||||
.unwrap()
|
||||
.push(migrate_shard);
|
||||
|
||||
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: migrate_shard,
|
||||
node_id: destination_node,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
||||
}
|
||||
|
||||
// Spread the shards across the nodes
|
||||
}
|
||||
Command::TenantDescribe { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
|
||||
10
docker-compose/README.md
Normal file
10
docker-compose/README.md
Normal file
@@ -0,0 +1,10 @@
|
||||
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
|
||||
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
@@ -23,11 +23,10 @@ echo "Page server is ready."
|
||||
echo "Create a tenant and timeline"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
-sb
|
||||
-X POST
|
||||
-X PUT
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
|
||||
http://pageserver:9898/v1/tenant/
|
||||
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
|
||||
@@ -96,12 +96,6 @@ pub struct ComputeSpec {
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
// When we are starting a new replica in hot standby mode,
|
||||
// we need to know if the primary is running.
|
||||
// This is used to determine if replica should wait for
|
||||
// RUNNING_XACTS from primary or not.
|
||||
pub primary_is_running: Option<bool>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -103,9 +103,10 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register maxrss_kb int gauge")
|
||||
});
|
||||
|
||||
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher,
|
||||
/// especially during many concurrent disk operations.
|
||||
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
|
||||
&[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];
|
||||
|
||||
pub struct BuildInfo {
|
||||
pub revision: &'static str,
|
||||
|
||||
@@ -11,6 +11,27 @@ use crate::{
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
pub new_tenant_id: TenantShardId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -280,4 +301,19 @@ mod test {
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reject_unknown_field() {
|
||||
let id = TenantId::generate();
|
||||
let create_request = serde_json::json!({
|
||||
"new_tenant_id": id.to_string(),
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
});
|
||||
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,8 +160,9 @@ impl Key {
|
||||
key
|
||||
}
|
||||
|
||||
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
|
||||
/// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`,
|
||||
/// and therefore not all 18B slices are valid page server keys.
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -173,7 +174,7 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently.
|
||||
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
|
||||
@@ -25,7 +25,6 @@ use utils::{
|
||||
serde_system_time,
|
||||
};
|
||||
|
||||
use crate::controller_api::PlacementPolicy;
|
||||
use crate::{
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
@@ -271,28 +270,6 @@ impl Default for ShardParameters {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
pub new_tenant_id: TenantShardId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
// This parameter is only meaningful in requests sent to the storage controller
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
/// simpler types.
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
@@ -547,10 +524,6 @@ pub struct LocationConfigListResponse {
|
||||
pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(pub TenantId);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
@@ -607,31 +580,6 @@ impl TenantConfigRequest {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
#[serde(default)]
|
||||
pub config: TenantAttachConfig,
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
/// its usage inside `TenantAttachRequest`.
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantAttachConfig {
|
||||
#[serde(flatten)]
|
||||
allowing_unknown_fields: TenantConfig,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantAttachConfig {
|
||||
type Target = TenantConfig;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.allowing_unknown_fields
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
@@ -650,8 +598,7 @@ pub struct TenantInfo {
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -1478,7 +1425,7 @@ mod tests {
|
||||
state: TenantState::Active,
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
generation: 1,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1488,7 +1435,8 @@ mod tests {
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
},
|
||||
"generation" : 1
|
||||
});
|
||||
|
||||
let original_broken = TenantInfo {
|
||||
@@ -1499,7 +1447,7 @@ mod tests {
|
||||
},
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
generation: 1,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
@@ -1513,7 +1461,8 @@ mod tests {
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
},
|
||||
"generation" : 1
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
@@ -1531,18 +1480,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_reject_unknown_field() {
|
||||
let id = TenantId::generate();
|
||||
let create_request = json!({
|
||||
"new_tenant_id": id.to_string(),
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
});
|
||||
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let id = TenantId::generate();
|
||||
let config_request = json!({
|
||||
"tenant_id": id.to_string(),
|
||||
@@ -1554,18 +1491,6 @@ mod tests {
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let attach_request = json!({
|
||||
"config": {
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
},
|
||||
});
|
||||
let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -144,20 +144,7 @@ impl PgConnectionConfig {
|
||||
// implement and this function is hardly a bottleneck. The function is only called around
|
||||
// establishing a new connection.
|
||||
#[allow(unstable_name_collisions)]
|
||||
config.options(
|
||||
&self
|
||||
.options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>(),
|
||||
);
|
||||
config.options(&encode_options(&self.options));
|
||||
}
|
||||
config
|
||||
}
|
||||
@@ -178,6 +165,21 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unstable_name_collisions)]
|
||||
fn encode_options(options: &[String]) -> String {
|
||||
options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_pg_connection_config {
|
||||
use crate::PgConnectionConfig;
|
||||
use crate::{encode_options, PgConnectionConfig};
|
||||
use once_cell::sync::Lazy;
|
||||
use url::Host;
|
||||
|
||||
@@ -255,18 +257,12 @@ mod tests_pg_connection_config {
|
||||
|
||||
#[test]
|
||||
fn test_with_options() {
|
||||
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
|
||||
"hello",
|
||||
"world",
|
||||
"with space",
|
||||
"and \\ backslashes",
|
||||
let options = encode_options(&[
|
||||
"hello".to_owned(),
|
||||
"world".to_owned(),
|
||||
"with space".to_owned(),
|
||||
"and \\ backslashes".to_owned(),
|
||||
]);
|
||||
assert_eq!(cfg.host(), &*STUB_HOST);
|
||||
assert_eq!(cfg.port(), 123);
|
||||
assert_eq!(cfg.raw_address(), "stub.host.example:123");
|
||||
assert_eq!(
|
||||
cfg.to_tokio_postgres_config().get_options(),
|
||||
Some("hello world with\\ space and\\ \\\\\\ backslashes")
|
||||
);
|
||||
assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -356,6 +356,28 @@ impl CheckPoint {
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Advance next multi-XID/offset to those given in arguments.
|
||||
///
|
||||
/// It's important that this handles wraparound correctly. This should match the
|
||||
/// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function.
|
||||
///
|
||||
/// Returns 'true' if the Checkpoint was updated.
|
||||
pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
|
||||
let mut modified = false;
|
||||
|
||||
if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 {
|
||||
self.nextMulti = multi_xid;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 {
|
||||
self.nextMultiOffset = multi_offset;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
modified
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate new, empty WAL segment, with correct block headers at the first
|
||||
|
||||
@@ -202,6 +202,53 @@ pub fn test_update_next_xid() {
|
||||
assert_eq!(checkpoint.nextXid.value, 2048);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_update_next_multixid() {
|
||||
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
|
||||
|
||||
// simple case
|
||||
checkpoint.nextMulti = 20;
|
||||
checkpoint.nextMultiOffset = 20;
|
||||
checkpoint.update_next_multixid(1000, 2000);
|
||||
assert_eq!(checkpoint.nextMulti, 1000);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 2000);
|
||||
|
||||
// No change
|
||||
checkpoint.update_next_multixid(500, 900);
|
||||
assert_eq!(checkpoint.nextMulti, 1000);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 2000);
|
||||
|
||||
// Close to wraparound, but not wrapped around yet
|
||||
checkpoint.nextMulti = 0xffff0000;
|
||||
checkpoint.nextMultiOffset = 0xfffe0000;
|
||||
checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff);
|
||||
assert_eq!(checkpoint.nextMulti, 0xffff00ff);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);
|
||||
|
||||
// Wraparound
|
||||
checkpoint.update_next_multixid(1, 900);
|
||||
assert_eq!(checkpoint.nextMulti, 1);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 900);
|
||||
|
||||
// Wraparound nextMulti to 0.
|
||||
//
|
||||
// It's a bit surprising that nextMulti can be 0, because that's a special value
|
||||
// (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound:
|
||||
// nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips
|
||||
// the 0 and the next multi-xid actually assigned is 1.
|
||||
checkpoint.nextMulti = 0xffff0000;
|
||||
checkpoint.nextMultiOffset = 0xfffe0000;
|
||||
checkpoint.update_next_multixid(0, 0xfffe00ff);
|
||||
assert_eq!(checkpoint.nextMulti, 0);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);
|
||||
|
||||
// Wraparound nextMultiOffset to 0
|
||||
checkpoint.update_next_multixid(0, 0);
|
||||
assert_eq!(checkpoint.nextMulti, 0);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_encode_logical_message() {
|
||||
let expected = [
|
||||
|
||||
@@ -34,7 +34,7 @@ use utils::backoff;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
|
||||
config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing,
|
||||
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
|
||||
277
libs/remote_storage/src/config.rs
Normal file
277
libs/remote_storage/src/config.rs
Normal file
@@ -0,0 +1,277 @@
|
||||
use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};
|
||||
|
||||
use anyhow::bail;
|
||||
use aws_sdk_s3::types::StorageClass;
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// The storage connection configuration.
|
||||
#[serde(flatten)]
|
||||
pub storage: RemoteStorageKind,
|
||||
/// A common timeout enforced for all requests after concurrency limiter permit has been
|
||||
/// acquired.
|
||||
#[serde(
|
||||
with = "humantime_serde",
|
||||
default = "default_timeout",
|
||||
skip_serializing_if = "is_default_timeout"
|
||||
)]
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
fn default_timeout() -> Duration {
|
||||
RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
fn is_default_timeout(d: &Duration) -> bool {
|
||||
*d == RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs { local_path: Utf8PathBuf },
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
/// Azure Blob based storage, storing all files in the container
|
||||
/// specified by the config
|
||||
AzureContainer(AzureConfig),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq, Deserialize, Serialize)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
#[serde(default = "default_remote_storage_s3_concurrency_limit")]
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
#[serde(default = "default_max_keys_per_list_response")]
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
#[serde(
|
||||
deserialize_with = "deserialize_storage_class",
|
||||
serialize_with = "serialize_storage_class",
|
||||
default
|
||||
)]
|
||||
pub upload_storage_class: Option<StorageClass>,
|
||||
}
|
||||
|
||||
fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn default_max_keys_per_list_response() -> Option<i32> {
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct AzureConfig {
|
||||
/// Name of the container to connect to.
|
||||
pub container_name: String,
|
||||
/// Name of the storage account the container is inside of
|
||||
pub storage_account: Option<String>,
|
||||
/// The region where the bucket is located at.
|
||||
pub container_region: String,
|
||||
/// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
|
||||
pub prefix_in_container: Option<String>,
|
||||
/// Azure has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
|
||||
#[serde(default = "default_remote_storage_azure_concurrency_limit")]
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
#[serde(default = "default_max_keys_per_list_response")]
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
|
||||
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
|
||||
}
|
||||
|
||||
impl Debug for AzureConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AzureConfig")
|
||||
.field("bucket_name", &self.container_name)
|
||||
.field("storage_account", &self.storage_account)
|
||||
.field("bucket_region", &self.container_region)
|
||||
.field("prefix_in_container", &self.prefix_in_container)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
|
||||
deserializer: D,
|
||||
) -> Result<Option<StorageClass>, D::Error> {
|
||||
Option::<String>::deserialize(deserializer).and_then(|s| {
|
||||
if let Some(s) = s {
|
||||
use serde::de::Error;
|
||||
let storage_class = StorageClass::from_str(&s).expect("infallible");
|
||||
#[allow(deprecated)]
|
||||
if matches!(storage_class, StorageClass::Unknown(_)) {
|
||||
return Err(D::Error::custom(format!(
|
||||
"Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
|
||||
StorageClass::values()
|
||||
)));
|
||||
}
|
||||
Ok(Some(storage_class))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_storage_class<S: serde::Serializer>(
|
||||
val: &Option<StorageClass>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
let val = val.as_ref().map(StorageClass::as_str);
|
||||
Option::<&str>::serialize(&val, serializer)
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
|
||||
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let document: toml_edit::Document = match toml {
|
||||
toml_edit::Item::Table(toml) => toml.clone().into(),
|
||||
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
|
||||
toml.clone().into_table().into()
|
||||
}
|
||||
_ => bail!("toml not a table or inline table"),
|
||||
};
|
||||
|
||||
if document.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(toml_edit::de::from_document(document)?))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
RemoteStorageConfig::from_toml(toml.as_item())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_localfs_config_with_timeout() {
|
||||
let input = "local_path = '.'
|
||||
timeout = '5s'";
|
||||
|
||||
let config = parse(input).unwrap().expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs {
|
||||
local_path: Utf8PathBuf::from(".")
|
||||
},
|
||||
timeout: Duration::from_secs(5)
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_s3_parsing() {
|
||||
let toml = "\
|
||||
bucket_name = 'foo-bar'
|
||||
bucket_region = 'eu-central-1'
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap().expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: "foo-bar".into(),
|
||||
bucket_region: "eu-central-1".into(),
|
||||
prefix_in_bucket: None,
|
||||
endpoint: None,
|
||||
concurrency_limit: default_remote_storage_s3_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
upload_storage_class: Some(StorageClass::IntelligentTiering),
|
||||
}),
|
||||
timeout: Duration::from_secs(7)
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_azure_parsing() {
|
||||
let toml = "\
|
||||
container_name = 'foo-bar'
|
||||
container_region = 'westeurope'
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap().expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: "foo-bar".into(),
|
||||
storage_account: None,
|
||||
container_region: "westeurope".into(),
|
||||
prefix_in_container: None,
|
||||
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
}),
|
||||
timeout: Duration::from_secs(7)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod azure_blob;
|
||||
mod config;
|
||||
mod error;
|
||||
mod local_fs;
|
||||
mod metrics;
|
||||
@@ -18,17 +19,10 @@ mod simulate_failures;
|
||||
mod support;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
pin::Pin,
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use aws_sdk_s3::types::StorageClass;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use bytes::Bytes;
|
||||
@@ -44,6 +38,8 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config};
|
||||
|
||||
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
||||
pub use azure_core::Etag;
|
||||
|
||||
@@ -525,168 +521,6 @@ impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// The storage connection configuration.
|
||||
#[serde(flatten)]
|
||||
pub storage: RemoteStorageKind,
|
||||
/// A common timeout enforced for all requests after concurrency limiter permit has been
|
||||
/// acquired.
|
||||
#[serde(with = "humantime_serde", default = "default_timeout")]
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
fn default_timeout() -> Duration {
|
||||
RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs { local_path: Utf8PathBuf },
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
/// Azure Blob based storage, storing all files in the container
|
||||
/// specified by the config
|
||||
AzureContainer(AzureConfig),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq, serde::Deserialize)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
#[serde(default = "default_remote_storage_s3_concurrency_limit")]
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
#[serde(default = "default_max_keys_per_list_response")]
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
#[serde(deserialize_with = "deserialize_storage_class", default)]
|
||||
pub upload_storage_class: Option<StorageClass>,
|
||||
}
|
||||
|
||||
fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn default_max_keys_per_list_response() -> Option<i32> {
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct AzureConfig {
|
||||
/// Name of the container to connect to.
|
||||
pub container_name: String,
|
||||
/// Name of the storage account the container is inside of
|
||||
pub storage_account: Option<String>,
|
||||
/// The region where the bucket is located at.
|
||||
pub container_region: String,
|
||||
/// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
|
||||
pub prefix_in_container: Option<String>,
|
||||
/// Azure has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
|
||||
#[serde(default = "default_remote_storage_azure_concurrency_limit")]
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
#[serde(default = "default_max_keys_per_list_response")]
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
|
||||
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
|
||||
}
|
||||
|
||||
impl Debug for AzureConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AzureConfig")
|
||||
.field("bucket_name", &self.container_name)
|
||||
.field("storage_account", &self.storage_account)
|
||||
.field("bucket_region", &self.container_region)
|
||||
.field("prefix_in_container", &self.prefix_in_container)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
|
||||
deserializer: D,
|
||||
) -> Result<Option<StorageClass>, D::Error> {
|
||||
Option::<String>::deserialize(deserializer).and_then(|s| {
|
||||
if let Some(s) = s {
|
||||
use serde::de::Error;
|
||||
let storage_class = StorageClass::from_str(&s).expect("infallible");
|
||||
#[allow(deprecated)]
|
||||
if matches!(storage_class, StorageClass::Unknown(_)) {
|
||||
return Err(D::Error::custom(format!(
|
||||
"Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
|
||||
StorageClass::values()
|
||||
)));
|
||||
}
|
||||
Ok(Some(storage_class))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
|
||||
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let document: toml_edit::Document = match toml {
|
||||
toml_edit::Item::Table(toml) => toml.clone().into(),
|
||||
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
|
||||
toml.clone().into_table().into()
|
||||
}
|
||||
_ => bail!("toml not a table or inline table"),
|
||||
};
|
||||
|
||||
if document.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(toml_edit::de::from_document(document)?))
|
||||
}
|
||||
}
|
||||
|
||||
struct ConcurrencyLimiter {
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
@@ -733,11 +567,6 @@ impl ConcurrencyLimiter {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
RemoteStorageConfig::from_toml(toml.as_item())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_object_name() {
|
||||
let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
|
||||
@@ -759,77 +588,4 @@ mod tests {
|
||||
let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
|
||||
assert_eq!(err.to_string(), "Path \"/\" is not relative");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_localfs_config_with_timeout() {
|
||||
let input = "local_path = '.'
|
||||
timeout = '5s'";
|
||||
|
||||
let config = parse(input).unwrap().expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs {
|
||||
local_path: Utf8PathBuf::from(".")
|
||||
},
|
||||
timeout: Duration::from_secs(5)
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_s3_parsing() {
|
||||
let toml = "\
|
||||
bucket_name = 'foo-bar'
|
||||
bucket_region = 'eu-central-1'
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap().expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: "foo-bar".into(),
|
||||
bucket_region: "eu-central-1".into(),
|
||||
prefix_in_bucket: None,
|
||||
endpoint: None,
|
||||
concurrency_limit: default_remote_storage_s3_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
upload_storage_class: Some(StorageClass::IntelligentTiering),
|
||||
}),
|
||||
timeout: Duration::from_secs(7)
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_azure_parsing() {
|
||||
let toml = "\
|
||||
container_name = 'foo-bar'
|
||||
container_region = 'westeurope'
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap().expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: "foo-bar".into(),
|
||||
storage_account: None,
|
||||
container_region: "westeurope".into(),
|
||||
prefix_in_container: None,
|
||||
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
}),
|
||||
timeout: Duration::from_secs(7)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,12 +46,12 @@ use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
config::S3Config,
|
||||
error::Cancelled,
|
||||
metrics::{start_counting_cancelled_wait, start_measuring_requests},
|
||||
support::PermitCarrying,
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use crate::metrics::AttemptOutcome;
|
||||
|
||||
@@ -9,20 +9,11 @@ use serde::{Deserialize, Serialize};
|
||||
/// numbers are used.
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
pub enum Generation {
|
||||
// Generations with this magic value will not add a suffix to S3 keys, and will not
|
||||
// be included in persisted index_part.json. This value is only to be used
|
||||
// during migration from pre-generation metadata to generation-aware metadata,
|
||||
// and should eventually go away.
|
||||
//
|
||||
// A special Generation is used rather than always wrapping Generation in an Option,
|
||||
// so that code handling generations doesn't have to be aware of the legacy
|
||||
// case everywhere it touches a generation.
|
||||
// The None Generation is used in the metadata of layers written before generations were
|
||||
// introduced. A running Tenant always has a valid generation, but the layer metadata may
|
||||
// include None generations.
|
||||
None,
|
||||
// Generations with this magic value may never be used to construct S3 keys:
|
||||
// we will panic if someone tries to. This is for Tenants in the "Broken" state,
|
||||
// so that we can satisfy their constructor with a Generation without risking
|
||||
// a code bug using it in an S3 write (broken tenants should never write)
|
||||
Broken,
|
||||
|
||||
Valid(u32),
|
||||
}
|
||||
|
||||
@@ -42,11 +33,6 @@ impl Generation {
|
||||
Self::None
|
||||
}
|
||||
|
||||
// Create a new generation that will panic if you try to use get_suffix
|
||||
pub fn broken() -> Self {
|
||||
Self::Broken
|
||||
}
|
||||
|
||||
pub const fn new(v: u32) -> Self {
|
||||
Self::Valid(v)
|
||||
}
|
||||
@@ -60,9 +46,6 @@ impl Generation {
|
||||
match self {
|
||||
Self::Valid(v) => GenerationFileSuffix(Some(*v)),
|
||||
Self::None => GenerationFileSuffix(None),
|
||||
Self::Broken => {
|
||||
panic!("Tried to use a broken generation");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,7 +69,6 @@ impl Generation {
|
||||
}
|
||||
}
|
||||
Self::None => Self::None,
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,7 +77,6 @@ impl Generation {
|
||||
match self {
|
||||
Self::Valid(n) => Self::Valid(*n + 1),
|
||||
Self::None => Self::Valid(1),
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,7 +109,7 @@ impl Serialize for Generation {
|
||||
if let Self::Valid(v) = self {
|
||||
v.serialize(serializer)
|
||||
} else {
|
||||
// We should never be asked to serialize a None or Broken. Structures
|
||||
// We should never be asked to serialize a None. Structures
|
||||
// that include an optional generation should convert None to an
|
||||
// Option<Generation>::None
|
||||
Err(serde::ser::Error::custom(
|
||||
@@ -159,9 +140,6 @@ impl Debug for Generation {
|
||||
Self::None => {
|
||||
write!(f, "<none>")
|
||||
}
|
||||
Self::Broken => {
|
||||
write!(f, "<broken>")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,22 +8,15 @@ use super::error::ApiError;
|
||||
pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<T, ApiError> {
|
||||
json_request_or_empty_body(request)
|
||||
.await?
|
||||
.context("missing request body")
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
|
||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<Option<T>, ApiError> {
|
||||
let body = hyper::body::aggregate(request.body_mut())
|
||||
.await
|
||||
.context("Failed to read request body")
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
if body.remaining() == 0 {
|
||||
return Ok(None);
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"missing request body"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
|
||||
@@ -31,7 +24,6 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
serde_path_to_error::deserialize(&mut deser)
|
||||
// intentionally stringify because the debug version is not helpful in python logs
|
||||
.map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
|
||||
.map(Some)
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![allow(clippy::todo)]
|
||||
|
||||
use std::ffi::CString;
|
||||
|
||||
use crate::{
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! ```
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
@@ -188,6 +189,7 @@ impl Request {
|
||||
manager
|
||||
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||
.await
|
||||
.context("request_redo")
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
|
||||
@@ -205,15 +205,6 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
|
||||
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// The tenant deletion API can return 202 if deletion is incomplete, or
|
||||
/// 404 if it is complete. Callers are responsible for checking the status
|
||||
/// code and retrying. Error codes other than 404 will return Err().
|
||||
|
||||
@@ -83,10 +83,18 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||
|
||||
// The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001
|
||||
|
||||
// Handle generation number `-00000001` part
|
||||
if lsns.last().expect("should").len() == 8 {
|
||||
lsns.pop();
|
||||
}
|
||||
|
||||
// Handle version number `-v1` part
|
||||
if lsns.last().expect("should").starts_with('v') {
|
||||
lsns.pop();
|
||||
}
|
||||
|
||||
if lsns.len() == 1 {
|
||||
lsns.push(lsns[0]);
|
||||
}
|
||||
|
||||
@@ -33,15 +33,10 @@ use utils::{
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||
use crate::{tenant::config::TenantConf, virtual_file};
|
||||
use crate::{
|
||||
TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
};
|
||||
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
|
||||
@@ -812,15 +807,11 @@ impl PageServerConf {
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
///
|
||||
/// Legacy: superseded by tenant_location_config_path. Eventually
|
||||
/// remove this function.
|
||||
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
/// where certain tenant's LocationConf be stored.
|
||||
pub(crate) fn tenant_location_config_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_LOCATION_CONFIG_NAME)
|
||||
}
|
||||
@@ -855,14 +846,6 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_deleted_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
@@ -382,17 +382,6 @@ pub enum DeletionQueueError {
|
||||
}
|
||||
|
||||
impl DeletionQueueClient {
|
||||
pub(crate) fn broken() -> Self {
|
||||
// Channels whose receivers are immediately dropped.
|
||||
let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
|
||||
Self {
|
||||
tx,
|
||||
executor_tx,
|
||||
lsn_table: Arc::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// This is cancel-safe. If you drop the future before it completes, the message
|
||||
/// is not pushed, although in the context of the deletion queue it doesn't matter: once
|
||||
/// we decide to do a deletion the decision is always final.
|
||||
|
||||
@@ -236,6 +236,13 @@ paths:
|
||||
type: string
|
||||
format: date-time
|
||||
description: A timestamp to get the LSN
|
||||
- name: with_lease
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: Whether to grant a lease to the corresponding LSN. Default to false.
|
||||
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
@@ -360,16 +367,7 @@ paths:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"409":
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
and hence this `/attach` call has been rejected.
|
||||
|
||||
Some examples of how this can happen:
|
||||
- tenant was created on this pageserver
|
||||
- tenant attachment was started by an earlier call to `/attach`.
|
||||
|
||||
Callers should poll the tenant status's `attachment_status` field,
|
||||
like for status 202. See the longer description for `POST /attach`
|
||||
for details.
|
||||
The tenant is already being modified, perhaps by a concurrent call to this API
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -755,8 +753,6 @@ components:
|
||||
For example this can be caused by s3 being unreachable. The retry may be implemented
|
||||
with call to detach, though it would be better to not automate it and inspec failed state
|
||||
manually before proceeding with a retry.
|
||||
|
||||
See the tenant `/attach` endpoint for more information.
|
||||
type: object
|
||||
required:
|
||||
- slug
|
||||
@@ -1029,6 +1025,10 @@ components:
|
||||
kind:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
valid_until:
|
||||
type: string
|
||||
format: date-time
|
||||
description: The expiration time of the granted lease.
|
||||
|
||||
LsnLease:
|
||||
type: object
|
||||
|
||||
@@ -21,6 +21,7 @@ use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
@@ -30,13 +31,11 @@ use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLocationConfigRequest,
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
|
||||
};
|
||||
use pageserver_api::shard::ShardCount;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -50,12 +49,10 @@ use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
@@ -77,13 +74,12 @@ use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::SpawnMode;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
||||
TimelineInfo,
|
||||
};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
@@ -329,14 +325,11 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
|
||||
use crate::tenant::delete::DeleteTenantError::*;
|
||||
impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTenantError::*;
|
||||
match value {
|
||||
Get(g) => ApiError::from(g),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
SlotError(e) => e.into(),
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
}
|
||||
@@ -731,6 +724,8 @@ async fn get_lsn_by_timestamp_handler(
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline =
|
||||
@@ -739,10 +734,15 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||
.await?;
|
||||
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
lease: Option<LsnLease>,
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
@@ -750,11 +750,28 @@ async fn get_lsn_by_timestamp_handler(
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
let result = Result { lsn, kind };
|
||||
|
||||
let lease = if with_lease {
|
||||
timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
|
||||
.inspect_err(|_| {
|
||||
warn!("fail to grant a lease to {}", lsn);
|
||||
})
|
||||
.ok()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let result = Result { lsn, kind, lease };
|
||||
let valid_until = result
|
||||
.lease
|
||||
.as_ref()
|
||||
.map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string());
|
||||
tracing::info!(
|
||||
lsn=?result.lsn,
|
||||
kind=%result.kind,
|
||||
timestamp=%timestamp_raw,
|
||||
valid_until=?valid_until,
|
||||
"lsn_by_timestamp finished"
|
||||
);
|
||||
json_response(StatusCode::OK, result)
|
||||
@@ -799,58 +816,6 @@ async fn get_timestamp_of_lsn_handler(
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_attach_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
let tenant_conf = match &maybe_body {
|
||||
Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
|
||||
None => TenantConfOpt::default(),
|
||||
};
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
info!("Handling tenant attach {tenant_id}");
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let shard_params = ShardParameters::default();
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
|
||||
.await?;
|
||||
|
||||
let Some(tenant) = tenant else {
|
||||
// This should never happen: indicates a bug in upsert_location
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Upsert succeeded but didn't return tenant!"
|
||||
)));
|
||||
};
|
||||
|
||||
// We might have successfully constructed a Tenant, but it could still
|
||||
// end up in a broken state:
|
||||
if let TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} = tenant.current_state()
|
||||
{
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Tenant state is Broken: {reason}"
|
||||
)));
|
||||
}
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn timeline_delete_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -881,26 +846,6 @@ async fn timeline_delete_handler(
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_detach_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
// This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
state
|
||||
.tenant_manager
|
||||
.detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_reset_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -940,7 +885,9 @@ async fn tenant_list_handler(
|
||||
state: state.clone(),
|
||||
current_physical_size: None,
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: (*gen).into(),
|
||||
generation: (*gen)
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -988,7 +935,10 @@ async fn tenant_status(
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: tenant.generation().into(),
|
||||
generation: tenant
|
||||
.generation()
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
@@ -1285,75 +1235,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Helper for requests that may take a generation, which is mandatory
|
||||
/// when control_plane_api is set, but otherwise defaults to Generation::none()
|
||||
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
|
||||
if state.conf.control_plane_api.is_some() {
|
||||
req_gen
|
||||
.map(Generation::new)
|
||||
.ok_or(ApiError::BadRequest(anyhow!(
|
||||
"generation attribute missing"
|
||||
)))
|
||||
} else {
|
||||
// Legacy mode: all tenants operate with no generation
|
||||
Ok(Generation::none())
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
let target_tenant_id = request_data.new_tenant_id;
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let _timer = STORAGE_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
|
||||
.expect("bug")
|
||||
.start_timer();
|
||||
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, request_data.generation)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let location_conf =
|
||||
LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);
|
||||
|
||||
let new_tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
target_tenant_id,
|
||||
location_conf,
|
||||
None,
|
||||
SpawnMode::Create,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let Some(new_tenant) = new_tenant else {
|
||||
// This should never happen: indicates a bug in upsert_location
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Upsert succeeded but didn't return tenant!"
|
||||
)));
|
||||
};
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
new_tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
|
||||
)
|
||||
}
|
||||
|
||||
async fn get_tenant_config_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1705,6 +1586,14 @@ async fn timeline_compact_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||
if !cfg!(feature = "testing") {
|
||||
return Err(ApiError::InternalServerError(anyhow!(
|
||||
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
||||
)));
|
||||
}
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
@@ -2651,7 +2540,6 @@ pub fn make_router(
|
||||
api_handler(r, reload_auth_validation_keys_handler)
|
||||
})
|
||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||
.get("/v1/tenant/:tenant_shard_id", |r| {
|
||||
api_handler(r, tenant_status)
|
||||
})
|
||||
@@ -2689,12 +2577,6 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||
api_handler(r, timeline_create_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/attach", |r| {
|
||||
api_handler(r, tenant_attach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||
api_handler(r, tenant_detach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
|
||||
api_handler(r, tenant_reset_handler)
|
||||
})
|
||||
|
||||
@@ -113,11 +113,7 @@ pub async fn shutdown_pageserver(
|
||||
}
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
/// Full path: `tenants/<tenant_id>/config-v1`.
|
||||
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
|
||||
@@ -53,9 +53,6 @@ pub(crate) enum StorageTimeOperation {
|
||||
|
||||
#[strum(serialize = "find gc cutoffs")]
|
||||
FindGcCutoffs,
|
||||
|
||||
#[strum(serialize = "create tenant")]
|
||||
CreateTenant,
|
||||
}
|
||||
|
||||
pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
|
||||
@@ -545,6 +542,15 @@ static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_valid_lsn_lease_count",
|
||||
"The number of valid leases after refreshing gc info.",
|
||||
&["tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -1436,6 +1442,46 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||
pub(crate) enum ComputeCommandKind {
|
||||
PageStreamV2,
|
||||
PageStream,
|
||||
Basebackup,
|
||||
GetLastRecordRlsn,
|
||||
Fullbackup,
|
||||
ImportBasebackup,
|
||||
ImportWal,
|
||||
LeaseLsn,
|
||||
Show,
|
||||
}
|
||||
|
||||
pub(crate) struct ComputeCommandCounters {
|
||||
map: EnumMap<ComputeCommandKind, IntCounter>,
|
||||
}
|
||||
|
||||
pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
|
||||
let inner = register_int_counter_vec!(
|
||||
"pageserver_compute_commands",
|
||||
"Number of compute -> pageserver commands processed",
|
||||
&["command"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
ComputeCommandCounters {
|
||||
map: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
|
||||
let command_str: &'static str = command.into();
|
||||
inner.with_label_values(&[command_str])
|
||||
})),
|
||||
}
|
||||
});
|
||||
|
||||
impl ComputeCommandCounters {
|
||||
pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
|
||||
&self.map[command]
|
||||
}
|
||||
}
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
@@ -2055,6 +2101,8 @@ pub(crate) struct TimelineMetrics {
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
/// Number of valid LSN leases.
|
||||
pub valid_lsn_lease_count_gauge: UIntGauge,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -2153,6 +2201,10 @@ impl TimelineMetrics {
|
||||
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
|
||||
.build(&tenant_id, &shard_id, &timeline_id);
|
||||
|
||||
let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -2175,6 +2227,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
valid_lsn_lease_count_gauge,
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
}
|
||||
}
|
||||
@@ -2224,6 +2277,7 @@ impl TimelineMetrics {
|
||||
}
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -2932,4 +2986,5 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&tenant_throttling::TIMELINE_GET);
|
||||
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
||||
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
|
||||
}
|
||||
|
||||
@@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
@@ -1554,6 +1554,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
@@ -1579,6 +1583,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStream)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
@@ -1605,6 +1613,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
@@ -1662,6 +1674,11 @@ where
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::GetLastRecordRlsn)
|
||||
.inc();
|
||||
|
||||
async {
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
@@ -1723,6 +1740,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Fullbackup)
|
||||
.inc();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
@@ -1771,6 +1792,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::ImportBasebackup)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_import_basebackup(
|
||||
pgb,
|
||||
@@ -1818,6 +1843,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::ImportWal)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
|
||||
.await
|
||||
@@ -1855,6 +1884,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::LeaseLsn)
|
||||
.inc();
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
@@ -1886,6 +1919,10 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Show)
|
||||
.inc();
|
||||
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
|
||||
@@ -55,11 +55,9 @@ use self::config::AttachedLocationConfig;
|
||||
use self::config::AttachmentMode;
|
||||
use self::config::LocationConf;
|
||||
use self::config::TenantConf;
|
||||
use self::delete::DeleteTenantFlow;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::mgr::GetActiveTenantError;
|
||||
use self::mgr::GetTenantError;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::upload::upload_index_part;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineCreateGuard;
|
||||
@@ -90,6 +88,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::walredo;
|
||||
use crate::InitializationOrder;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
@@ -137,7 +136,6 @@ pub mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod config;
|
||||
pub mod delete;
|
||||
pub mod mgr;
|
||||
pub mod secondary;
|
||||
pub mod tasks;
|
||||
@@ -161,8 +159,6 @@ pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
|
||||
/// References to shared objects that are passed into each tenant, such
|
||||
/// as the shared remote storage client and process initialization state.
|
||||
#[derive(Clone)]
|
||||
@@ -207,7 +203,6 @@ struct TimelinePreload {
|
||||
}
|
||||
|
||||
pub(crate) struct TenantPreload {
|
||||
deleting: bool,
|
||||
timelines: HashMap<TimelineId, TimelinePreload>,
|
||||
}
|
||||
|
||||
@@ -218,8 +213,6 @@ pub(crate) enum SpawnMode {
|
||||
Eager,
|
||||
/// Lazy activation in the background, with the option to skip the queue if the need comes up
|
||||
Lazy,
|
||||
/// Tenant has been created during the lifetime of this process
|
||||
Create,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -286,8 +279,6 @@ pub struct Tenant {
|
||||
/// background warmup.
|
||||
pub(crate) activate_now_sem: tokio::sync::Semaphore,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
|
||||
// Cancellation token fires when we have entered shutdown(). This is a parent of
|
||||
// Timelines' cancellation token.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
@@ -331,6 +322,16 @@ impl From<harness::TestRedoManager> for WalRedoManager {
|
||||
}
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.shutdown().await,
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
@@ -351,7 +352,7 @@ impl WalRedoManager {
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<bytes::Bytes> {
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
Self::Prod(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
@@ -654,10 +655,9 @@ impl Tenant {
|
||||
attached_conf: AttachedTenantConf,
|
||||
shard_identity: ShardIdentity,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
) -> Arc<Tenant> {
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
@@ -806,9 +806,6 @@ impl Tenant {
|
||||
};
|
||||
|
||||
let preload = match &mode {
|
||||
SpawnMode::Create => {
|
||||
None
|
||||
},
|
||||
SpawnMode::Eager | SpawnMode::Lazy => {
|
||||
let _preload_timer = TENANT.preload.start_timer();
|
||||
let res = tenant_clone
|
||||
@@ -828,59 +825,10 @@ impl Tenant {
|
||||
// Remote preload is complete.
|
||||
drop(remote_load_completion);
|
||||
|
||||
let pending_deletion = {
|
||||
match DeleteTenantFlow::should_resume_deletion(
|
||||
conf,
|
||||
preload.as_ref().map(|p| p.deleting).unwrap_or(false),
|
||||
&tenant_clone,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(should_resume_deletion) => should_resume_deletion,
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
info!("pending_deletion {}", pending_deletion.is_some());
|
||||
|
||||
if let Some(deletion) = pending_deletion {
|
||||
// as we are no longer loading, signal completion by dropping
|
||||
// the completion while we resume deletion
|
||||
drop(_completion);
|
||||
let background_jobs_can_start =
|
||||
init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
let deleted = DeleteTenantFlow::resume_from_attach(
|
||||
deletion,
|
||||
&tenant_clone,
|
||||
preload,
|
||||
tenants,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Err(e) = deleted {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
|
||||
let attached = {
|
||||
let _attach_timer = match mode {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
|
||||
};
|
||||
tenant_clone.attach(preload, mode, &ctx).await
|
||||
let _attach_timer = Some(TENANT.attach.start_timer());
|
||||
tenant_clone.attach(preload, &ctx).await
|
||||
};
|
||||
|
||||
match attached {
|
||||
@@ -911,7 +859,7 @@ impl Tenant {
|
||||
}
|
||||
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
|
||||
);
|
||||
Ok(tenant)
|
||||
tenant
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
@@ -931,21 +879,13 @@ impl Tenant {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME);
|
||||
info!(
|
||||
"found {} timelines, deleting={}",
|
||||
remote_timeline_ids.len(),
|
||||
deleting
|
||||
);
|
||||
info!("found {} timelines", remote_timeline_ids.len(),);
|
||||
|
||||
for k in other_keys {
|
||||
if k != TENANT_DELETED_MARKER_FILE_NAME {
|
||||
warn!("Unexpected non timeline key {k}");
|
||||
}
|
||||
warn!("Unexpected non timeline key {k}");
|
||||
}
|
||||
|
||||
Ok(TenantPreload {
|
||||
deleting,
|
||||
timelines: Self::load_timeline_metadata(
|
||||
self,
|
||||
remote_timeline_ids,
|
||||
@@ -964,22 +904,14 @@ impl Tenant {
|
||||
async fn attach(
|
||||
self: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match (preload, mode) {
|
||||
(Some(p), _) => p,
|
||||
(None, SpawnMode::Create) => TenantPreload {
|
||||
deleting: false,
|
||||
timelines: HashMap::new(),
|
||||
},
|
||||
(None, _) => {
|
||||
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
|
||||
}
|
||||
let Some(preload) = preload else {
|
||||
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
|
||||
};
|
||||
|
||||
let mut timelines_to_resume_deletions = vec![];
|
||||
@@ -1211,30 +1143,6 @@ impl Tenant {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Create a placeholder Tenant object for a broken tenant
|
||||
pub fn create_broken_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
reason: String,
|
||||
) -> Arc<Tenant> {
|
||||
Arc::new(Tenant::new(
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: String::new(),
|
||||
},
|
||||
conf,
|
||||
AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
|
||||
// Shard identity isn't meaningful for a broken tenant: it's just a placeholder
|
||||
// to occupy the slot for this TenantShardId.
|
||||
ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
|
||||
None,
|
||||
tenant_shard_id,
|
||||
remote_storage,
|
||||
DeletionQueueClient::broken(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn load_timeline_metadata(
|
||||
self: &Arc<Tenant>,
|
||||
timeline_ids: HashSet<TimelineId>,
|
||||
@@ -1941,6 +1849,10 @@ impl Tenant {
|
||||
tracing::debug!("Waiting for tasks...");
|
||||
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
|
||||
|
||||
if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
|
||||
walredo_mgr.shutdown().await;
|
||||
}
|
||||
|
||||
// Wait for any in-flight operations to complete
|
||||
self.gate.close().await;
|
||||
|
||||
@@ -2215,6 +2127,7 @@ impl Tenant {
|
||||
// Upload an index from the parent: this is partly to provide freshness for the
|
||||
// child tenants that will copy it, and partly for general ease-of-debugging: there will
|
||||
// always be a parent shard index in the same generation as we wrote the child shard index.
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index");
|
||||
timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_file_changes()?;
|
||||
@@ -2222,12 +2135,14 @@ impl Tenant {
|
||||
|
||||
// Shut down the timeline's remote client: this means that the indices we write
|
||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
|
||||
timeline.remote_client.shutdown().await;
|
||||
|
||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
|
||||
// we use here really is the remotely persistent one).
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
|
||||
let result = timeline.remote_client
|
||||
.download_index_file(&self.cancel)
|
||||
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
|
||||
@@ -2240,6 +2155,7 @@ impl Tenant {
|
||||
};
|
||||
|
||||
for child_shard in child_shards {
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
|
||||
upload_index_part(
|
||||
&self.remote_storage,
|
||||
child_shard,
|
||||
@@ -2554,6 +2470,10 @@ impl Tenant {
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
) -> Tenant {
|
||||
debug_assert!(
|
||||
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
|
||||
);
|
||||
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
tokio::spawn(async move {
|
||||
@@ -2628,7 +2548,6 @@ impl Tenant {
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
cancel: CancellationToken::default(),
|
||||
gate: Gate::default(),
|
||||
timeline_get_throttle: Arc::new(throttle::Throttle::new(
|
||||
@@ -2645,45 +2564,22 @@ impl Tenant {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<LocationConf> {
|
||||
let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
|
||||
let config_path = conf.tenant_location_config_path(tenant_shard_id);
|
||||
|
||||
if config_path.exists() {
|
||||
// New-style config takes precedence
|
||||
let deserialized = Self::read_config(&config_path)?;
|
||||
Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
|
||||
} else if legacy_config_path.exists() {
|
||||
// Upgrade path: found an old-style configuration only
|
||||
let deserialized = Self::read_config(&legacy_config_path)?;
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
for (key, item) in deserialized.iter() {
|
||||
match key {
|
||||
"tenant_config" => {
|
||||
tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?;
|
||||
}
|
||||
_ => bail!(
|
||||
"config file {legacy_config_path} has unrecognized pageserver option '{key}'"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
// Legacy configs are implicitly in attached state, and do not support sharding
|
||||
Ok(LocationConf::attached_single(
|
||||
tenant_conf,
|
||||
Generation::none(),
|
||||
&models::ShardParameters::default(),
|
||||
))
|
||||
} else {
|
||||
// FIXME If the config file is not found, assume that we're attaching
|
||||
// a detached tenant and config is passed via attach command.
|
||||
// https://github.com/neondatabase/neon/issues/1555
|
||||
// OR: we're loading after incomplete deletion that managed to remove config.
|
||||
info!(
|
||||
"tenant config not found in {} or {}",
|
||||
config_path, legacy_config_path
|
||||
);
|
||||
Ok(LocationConf::default())
|
||||
// The config should almost always exist for a tenant directory:
|
||||
// - When attaching a tenant, the config is the first thing we write
|
||||
// - When detaching a tenant, we atomically move the directory to a tmp location
|
||||
// before deleting contents.
|
||||
//
|
||||
// The very rare edge case that can result in a missing config is if we crash during attach
|
||||
// between creating directory and writing config. Callers should handle that as if the
|
||||
// directory didn't exist.
|
||||
anyhow::bail!("tenant config not found in {}", config_path);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2705,47 +2601,17 @@ impl Tenant {
|
||||
tenant_shard_id: &TenantShardId,
|
||||
location_conf: &LocationConf,
|
||||
) -> anyhow::Result<()> {
|
||||
let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
|
||||
let config_path = conf.tenant_location_config_path(tenant_shard_id);
|
||||
|
||||
Self::persist_tenant_config_at(
|
||||
tenant_shard_id,
|
||||
&config_path,
|
||||
&legacy_config_path,
|
||||
location_conf,
|
||||
)
|
||||
.await
|
||||
Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(super) async fn persist_tenant_config_at(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
config_path: &Utf8Path,
|
||||
legacy_config_path: &Utf8Path,
|
||||
location_conf: &LocationConf,
|
||||
) -> anyhow::Result<()> {
|
||||
if let LocationMode::Attached(attach_conf) = &location_conf.mode {
|
||||
// The modern-style LocationConf config file requires a generation to be set. In case someone
|
||||
// is running a pageserver without the infrastructure to set generations, write out the legacy-style
|
||||
// config file that only contains TenantConf.
|
||||
//
|
||||
// This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
|
||||
|
||||
if attach_conf.generation.is_none() {
|
||||
tracing::info!(
|
||||
"Running without generations, writing legacy-style tenant config file"
|
||||
);
|
||||
Self::persist_tenant_config_legacy(
|
||||
tenant_shard_id,
|
||||
legacy_config_path,
|
||||
&location_conf.tenant_conf,
|
||||
)
|
||||
.await?;
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
debug!("persisting tenantconf to {config_path}");
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
@@ -2772,37 +2638,6 @@ impl Tenant {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
async fn persist_tenant_config_legacy(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
target_config_path: &Utf8Path,
|
||||
tenant_conf: &TenantConfOpt,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("persisting tenantconf to {target_config_path}");
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
# It is read in case of pageserver restart.
|
||||
|
||||
[tenant_config]
|
||||
"#
|
||||
.to_string();
|
||||
|
||||
// Convert the config to a toml file.
|
||||
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
|
||||
|
||||
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
|
||||
|
||||
let tenant_shard_id = *tenant_shard_id;
|
||||
let target_config_path = target_config_path.to_owned();
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("write tenant {tenant_shard_id} config to {target_config_path}")
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// How garbage collection works:
|
||||
//
|
||||
@@ -3021,6 +2856,11 @@ impl Tenant {
|
||||
let now = SystemTime::now();
|
||||
target.leases.retain(|_, lease| !lease.is_expired(&now));
|
||||
|
||||
timeline
|
||||
.metrics
|
||||
.valid_lsn_lease_count_gauge
|
||||
.set(target.leases.len() as u64);
|
||||
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
target.retain_lsns = branchpoints;
|
||||
@@ -3986,7 +3826,7 @@ pub(crate) mod harness {
|
||||
let preload = tenant
|
||||
.preload(&self.remote_storage, CancellationToken::new())
|
||||
.await?;
|
||||
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
|
||||
tenant.attach(Some(preload), ctx).await?;
|
||||
|
||||
tenant.state.send_replace(TenantState::Active);
|
||||
for timeline in tenant.timelines.lock().unwrap().values() {
|
||||
@@ -4014,7 +3854,7 @@ pub(crate) mod harness {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, walredo::Error> {
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
// For Neon wal records, we can decode without spawning postgres, so do so.
|
||||
@@ -4068,6 +3908,7 @@ mod tests {
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::GcInfo;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -6423,7 +6264,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
@@ -6745,49 +6586,48 @@ mod tests {
|
||||
|
||||
// img layer at 0x10
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
// TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 1@0x20")),
|
||||
Value::Image(Bytes::from("value 1@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::Image(test_img("value 2@0x30")),
|
||||
Value::Image(Bytes::from("value 2@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 3@0x40")),
|
||||
Value::Image(Bytes::from("value 3@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 5@0x20")),
|
||||
Value::Image(Bytes::from("value 5@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 6@0x20")),
|
||||
Value::Image(Bytes::from("value 6@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 8@0x40")),
|
||||
Value::Image(Bytes::from("value 8@0x40")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 9@0x40")),
|
||||
Value::Image(Bytes::from("value 9@0x40")),
|
||||
),
|
||||
];
|
||||
|
||||
@@ -6809,9 +6649,42 @@ mod tests {
|
||||
guard.cutoffs.horizon = Lsn(0x30);
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x20"),
|
||||
Bytes::from_static(b"value 2@0x30"),
|
||||
Bytes::from_static(b"value 3@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x20"),
|
||||
Bytes::from_static(b"value 6@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x40"),
|
||||
Bytes::from_static(b"value 9@0x40"),
|
||||
];
|
||||
|
||||
for (idx, expected) in expected_result.iter().enumerate() {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
expected
|
||||
);
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
|
||||
for (idx, expected) in expected_result.iter().enumerate() {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
expected
|
||||
);
|
||||
}
|
||||
|
||||
// Check if the image layer at the GC horizon contains exactly what we want
|
||||
let image_at_gc_horizon = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
@@ -6822,14 +6695,22 @@ mod tests {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(image_at_gc_horizon.len(), 10);
|
||||
let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x20"),
|
||||
Bytes::from_static(b"value 2@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x20"),
|
||||
Bytes::from_static(b"value 6@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
image_at_gc_horizon[idx],
|
||||
(
|
||||
get_key(idx as u32),
|
||||
test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
|
||||
)
|
||||
(get_key(idx as u32), expected_result[idx].clone())
|
||||
);
|
||||
}
|
||||
|
||||
@@ -6862,7 +6743,7 @@ mod tests {
|
||||
},
|
||||
// The delta layer that is cut in the middle
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(9),
|
||||
key_range: get_key(3)..get_key(4),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||
is_delta: true
|
||||
},
|
||||
@@ -6947,6 +6828,9 @@ mod tests {
|
||||
tline.get(get_key(2), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"0x10,0x20,0x30")
|
||||
);
|
||||
|
||||
// Need to remove the limit of "Neon WAL redo requires base image".
|
||||
|
||||
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
|
||||
@@ -7041,4 +6925,174 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
|
||||
//
|
||||
// | D1 | | D3 |
|
||||
// -| |-- gc horizon -----------------
|
||||
// | | | D2 |
|
||||
// --------- img layer ------------------
|
||||
//
|
||||
// What we should expact from this compaction is:
|
||||
// | Part of D1 | | D3 |
|
||||
// --------- img layer with D1+D2 at GC horizon------------------
|
||||
|
||||
// img layer at 0x10
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![delta1, delta2, delta3], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
pitr: Lsn(0x30),
|
||||
horizon: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x40"),
|
||||
Bytes::from_static(b"value 9@0x10@0x40"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,6 +160,7 @@ impl<'a> BlockCursor<'a> {
|
||||
///
|
||||
/// The file is assumed to be immutable. This doesn't provide any functions
|
||||
/// for modifying the file, nor for invalidating the cache if it is modified.
|
||||
#[derive(Clone)]
|
||||
pub struct FileBlockReader<'a> {
|
||||
pub file: &'a VirtualFile,
|
||||
|
||||
|
||||
@@ -281,22 +281,6 @@ impl LocationConf {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LocationConf {
|
||||
// TODO: this should be removed once tenant loading can guarantee that we are never
|
||||
// loading from a directory without a configuration.
|
||||
// => tech debt since https://github.com/neondatabase/neon/issues/1555
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: Generation::none(),
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
tenant_conf: TenantConfOpt::default(),
|
||||
shard: ShardIdentity::unsharded(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A tenant's calcuated configuration, which is the result of merging a
|
||||
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
||||
///
|
||||
|
||||
@@ -1,426 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{models::TenantState, shard::TenantShardId};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, Instrument};
|
||||
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self},
|
||||
tenant::{
|
||||
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
#[error("Tenant map slot upsert error {0}")]
|
||||
SlotUpsertError(#[from] TenantSlotUpsertError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
|
||||
|
||||
fn remote_tenant_delete_mark_path(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let tenant_remote_path = conf
|
||||
.tenant_path(tenant_shard_id)
|
||||
.strip_prefix(&conf.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
}
|
||||
|
||||
async fn schedule_ordered_timeline_deletions(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
|
||||
// Tenant is stopping at this point. We know it will be deleted.
|
||||
// No new timelines should be created.
|
||||
// Tree sort timelines to delete from leafs to the root.
|
||||
// NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
|
||||
// can complete and remove timeline from the map in between our call to clone
|
||||
// and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
|
||||
// timelines.lock is currently synchronous so we cant hold it across await point.
|
||||
// So just ignore NotFound error if we get it from `run`.
|
||||
// Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
|
||||
let timelines = tenant.timelines.lock().unwrap().clone();
|
||||
let sorted =
|
||||
tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
|
||||
|
||||
let mut already_running_deletions = vec![];
|
||||
|
||||
for (timeline_id, _) in sorted.into_iter().rev() {
|
||||
let span = tracing::info_span!("timeline_delete", %timeline_id);
|
||||
let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
|
||||
.instrument(span)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
match e {
|
||||
DeleteTimelineError::NotFound => {
|
||||
// Timeline deletion finished after call to clone above but before call
|
||||
// to `DeleteTimelineFlow::run` and removed timeline from the map.
|
||||
continue;
|
||||
}
|
||||
DeleteTimelineError::AlreadyInProgress(guard) => {
|
||||
already_running_deletions.push((guard, timeline_id));
|
||||
continue;
|
||||
}
|
||||
e => return Err(DeleteTenantError::Timeline(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(already_running_deletions)
|
||||
}
|
||||
|
||||
async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
|
||||
let list = &list.into_iter().take(10).collect::<Vec<_>>();
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
||||
async fn cleanup_remaining_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
|
||||
if is_dir {
|
||||
tokio::fs::remove_dir(&p).await
|
||||
} else {
|
||||
tokio::fs::remove_file(&p).await
|
||||
}
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.with_context(|| format!("failed to delete {p}"))
|
||||
};
|
||||
|
||||
rm(conf.tenant_config_path(tenant_shard_id), false).await?;
|
||||
rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-timelines-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.timelines_path(tenant_shard_id), true).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-deleted-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
// So its possible for mark to reach disk first and for other deletions
|
||||
// to be reordered later and thus missed if a crash occurs.
|
||||
// Note that we dont need to sync after mark file is removed
|
||||
// because we can tolerate the case when mark file reappears on startup.
|
||||
let tenant_path = &conf.tenant_path(tenant_shard_id);
|
||||
if tenant_path.exists() {
|
||||
crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
}
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
|
||||
|
||||
rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.tenant_path(tenant_shard_id), true).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl DeleteTenantFlow {
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
Some(
|
||||
Arc::clone(&t.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("we're the only owner during init"),
|
||||
)
|
||||
};
|
||||
|
||||
if remote_mark_exists {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf
|
||||
.tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
|
||||
.exists()
|
||||
{
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, false, true)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(preload, super::SpawnMode::Eager, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
// Note that if deletion fails we dont mark timelines as broken,
|
||||
// the whole tenant will become broken as by `Self::schedule_background` logic
|
||||
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
|
||||
.await
|
||||
.context("schedule_ordered_timeline_deletions")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-polling-ongoing-deletions"
|
||||
))?
|
||||
});
|
||||
|
||||
// Wait for deletions that were already running at the moment when tenant deletion was requested.
|
||||
// When we can lock deletion guard it means that corresponding timeline deletion finished.
|
||||
for (guard, timeline_id) in already_running_timeline_deletions {
|
||||
let flow = guard.lock().await;
|
||||
if !flow.is_finished() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"already running timeline deletion failed: {timeline_id}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Remove top-level tenant objects that don't belong to a timeline, such as heatmap
|
||||
let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
|
||||
if let Some(Err(e)) = backoff::retry(
|
||||
|| async {
|
||||
remote_storage
|
||||
.delete(&heatmap_path, &task_mgr::shutdown_token())
|
||||
.await
|
||||
},
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_remote_tenant_heatmap",
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
|
||||
}
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
|
||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||
if timelines_path.exists() {
|
||||
// sanity check to guard against layout changes
|
||||
ensure_timelines_dir_empty(&timelines_path)
|
||||
.await
|
||||
.context("timelines dir not empty")?;
|
||||
}
|
||||
|
||||
remove_tenant_remote_delete_mark(
|
||||
conf,
|
||||
&remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
|
||||
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
|
||||
))?
|
||||
});
|
||||
|
||||
cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
{
|
||||
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
|
||||
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
|
||||
//
|
||||
// This complexity will go away when we simplify how deletion works:
|
||||
// https://github.com/neondatabase/neon/issues/5080
|
||||
loop {
|
||||
// Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
|
||||
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
|
||||
let barrier = {
|
||||
let mut locked = tenants.write().unwrap();
|
||||
let removed = locked.remove(tenant.tenant_shard_id);
|
||||
|
||||
// FIXME: we should not be modifying this from outside of mgr.rs.
|
||||
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
||||
|
||||
// Update stats
|
||||
match &removed {
|
||||
TenantsMapRemoveResult::Occupied(slot) => {
|
||||
crate::metrics::TENANT_MANAGER.slot_removed(slot);
|
||||
}
|
||||
TenantsMapRemoveResult::InProgress(barrier) => {
|
||||
crate::metrics::TENANT_MANAGER
|
||||
.slot_removed(&TenantSlot::InProgress(barrier.clone()));
|
||||
}
|
||||
TenantsMapRemoveResult::Vacant => {
|
||||
// Nothing changed in map, no metric update
|
||||
}
|
||||
}
|
||||
|
||||
match removed {
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
|
||||
match tenant.current_state() {
|
||||
TenantState::Stopping { .. } | TenantState::Broken { .. } => {
|
||||
// Expected: we put the tenant into stopping state before we start deleting it
|
||||
}
|
||||
state => {
|
||||
// Unexpected state
|
||||
tracing::warn!(
|
||||
"Tenant in unexpected state {state} after deletion"
|
||||
);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
|
||||
// This is unexpected: this secondary tenants should not have been created, and we
|
||||
// are not in a position to shut it down from here.
|
||||
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
|
||||
unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
|
||||
}
|
||||
TenantsMapRemoveResult::Vacant => {
|
||||
tracing::warn!(
|
||||
"Tenant removed from TenantsMap before deletion completed"
|
||||
);
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::InProgress(barrier) => {
|
||||
// An InProgress entry was found, we must wait on its barrier
|
||||
barrier
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"Waiting for competing operation to complete before deleting state for tenant"
|
||||
);
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -212,6 +212,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
///
|
||||
/// Public reader object, to search the tree.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct DiskBtreeReader<R, const L: usize>
|
||||
where
|
||||
R: BlockReader,
|
||||
@@ -259,27 +260,38 @@ where
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn iter<'a>(
|
||||
&'a self,
|
||||
start_key: &'a [u8; L],
|
||||
ctx: &'a RequestContext,
|
||||
) -> DiskBtreeIterator<'a> {
|
||||
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
|
||||
where
|
||||
R: 'a,
|
||||
{
|
||||
DiskBtreeIterator {
|
||||
stream: Box::pin(self.get_stream_from(start_key, ctx)),
|
||||
stream: Box::pin(self.into_stream(start_key, ctx)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a stream which yields all key, value pairs from the index
|
||||
/// starting from the first key greater or equal to `start_key`.
|
||||
///
|
||||
/// Note that this is a copy of [`Self::visit`].
|
||||
/// Note 1: that this is a copy of [`Self::visit`].
|
||||
/// TODO: Once the sequential read path is removed this will become
|
||||
/// the only index traversal method.
|
||||
pub fn get_stream_from<'a>(
|
||||
&'a self,
|
||||
///
|
||||
/// Note 2: this function used to take `&self` but it now consumes `self`. This is due to
|
||||
/// the lifetime constraints of the reader and the stream / iterator it creates. Using `&self`
|
||||
/// requires the reader to be present when the stream is used, and this creates a lifetime
|
||||
/// dependency between the reader and the stream. Now if we want to create an iterator that
|
||||
/// holds the stream, someone will need to keep a reference to the reader, which is inconvenient
|
||||
/// to use from the image/delta layer APIs.
|
||||
///
|
||||
/// Feel free to add the `&self` variant back if it's necessary.
|
||||
pub fn into_stream<'a>(
|
||||
self,
|
||||
start_key: &'a [u8; L],
|
||||
ctx: &'a RequestContext,
|
||||
) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a {
|
||||
) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a
|
||||
where
|
||||
R: 'a,
|
||||
{
|
||||
try_stream! {
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
|
||||
@@ -51,7 +51,6 @@ use utils::fs_ext::PathExt;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::remote_timeline_client::remote_tenant_path;
|
||||
use super::secondary::SecondaryTenant;
|
||||
use super::timeline::detach_ancestor::PreparedTimelineDetach;
|
||||
@@ -109,12 +108,6 @@ pub(crate) enum TenantsMap {
|
||||
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
|
||||
}
|
||||
|
||||
pub(crate) enum TenantsMapRemoveResult {
|
||||
Occupied(TenantSlot),
|
||||
Vacant,
|
||||
InProgress(utils::completion::Barrier),
|
||||
}
|
||||
|
||||
/// When resolving a TenantId to a shard, we may be looking for the 0th
|
||||
/// shard, or we might be looking for whichever shard holds a particular page.
|
||||
#[derive(Copy, Clone)]
|
||||
@@ -191,26 +184,6 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
|
||||
/// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
|
||||
///
|
||||
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
|
||||
/// slot if the enclosed tenant is shutdown.
|
||||
pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
|
||||
use std::collections::btree_map::Entry;
|
||||
match self {
|
||||
TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
|
||||
Entry::Occupied(entry) => match entry.get() {
|
||||
TenantSlot::InProgress(barrier) => {
|
||||
TenantsMapRemoveResult::InProgress(barrier.clone())
|
||||
}
|
||||
_ => TenantsMapRemoveResult::Occupied(entry.remove()),
|
||||
},
|
||||
Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(debug_assertions, not(test)))]
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
match self {
|
||||
@@ -460,6 +433,18 @@ async fn init_load_tenant_configs(
|
||||
Ok(configs)
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTenantError {
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the tenant once download is completed.
|
||||
@@ -510,17 +495,8 @@ pub async fn init_tenant_mgr(
|
||||
let mut location_conf = match location_conf {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}");
|
||||
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Attached(Tenant::create_broken_tenant(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
resources.remote_storage.clone(),
|
||||
format!("{}", e),
|
||||
)),
|
||||
);
|
||||
// This should only happen in the case of a serialization bug or critical local I/O error: we cannot load this tenant
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to load tenant config, failed to {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -629,7 +605,6 @@ pub async fn init_tenant_mgr(
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
@@ -685,7 +660,6 @@ fn tenant_spawn(
|
||||
location_conf: AttachedTenantConf,
|
||||
shard_identity: ShardIdentity,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
@@ -704,24 +678,16 @@ fn tenant_spawn(
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
|
||||
let remote_storage = resources.remote_storage.clone();
|
||||
let tenant = match Tenant::spawn(
|
||||
let tenant = Tenant::spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
resources,
|
||||
location_conf,
|
||||
shard_identity,
|
||||
init_order,
|
||||
tenants,
|
||||
mode,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
|
||||
}
|
||||
};
|
||||
);
|
||||
|
||||
Ok(tenant)
|
||||
}
|
||||
@@ -1161,7 +1127,6 @@ impl TenantManager {
|
||||
attached_conf,
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
spawn_mode,
|
||||
ctx,
|
||||
)?;
|
||||
@@ -1283,7 +1248,6 @@ impl TenantManager {
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)?;
|
||||
@@ -1634,7 +1598,7 @@ impl TenantManager {
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
let child_shard = {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
let locked = self.tenants.read().unwrap();
|
||||
let peek_slot =
|
||||
tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
|
||||
peek_slot.and_then(|s| s.get_attached()).cloned()
|
||||
@@ -1735,6 +1699,7 @@ impl TenantManager {
|
||||
let timelines = parent_shard.timelines.lock().unwrap().clone();
|
||||
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
|
||||
for timeline in timelines.values() {
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
|
||||
let timeline_layers = timeline
|
||||
.layers
|
||||
.read()
|
||||
@@ -1774,7 +1739,12 @@ impl TenantManager {
|
||||
|
||||
// Since we will do a large number of small filesystem metadata operations, batch them into
|
||||
// spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
|
||||
let span = tracing::Span::current();
|
||||
let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
|
||||
// Run this synchronous code in the same log context as the outer function that spawned it.
|
||||
let _span = span.enter();
|
||||
|
||||
tracing::info!("Creating {} directories", create_dirs.len());
|
||||
for dir in &create_dirs {
|
||||
if let Err(e) = std::fs::create_dir_all(dir) {
|
||||
// Ignore AlreadyExists errors, drop out on all other errors
|
||||
@@ -1788,6 +1758,11 @@ impl TenantManager {
|
||||
}
|
||||
|
||||
for child_prefix in child_prefixes {
|
||||
tracing::info!(
|
||||
"Hard-linking {} parent layers into child path {}",
|
||||
parent_layers.len(),
|
||||
child_prefix
|
||||
);
|
||||
for relative_layer in &parent_layers {
|
||||
let parent_path = parent_path.join(relative_layer);
|
||||
let child_path = child_prefix.join(relative_layer);
|
||||
@@ -1813,6 +1788,7 @@ impl TenantManager {
|
||||
// Durability is not required for correctness, but if we crashed during split and
|
||||
// then came restarted with empty timeline dirs, it would be very inefficient to
|
||||
// re-populate from remote storage.
|
||||
tracing::info!("fsyncing {} directories", create_dirs.len());
|
||||
for dir in create_dirs {
|
||||
if let Err(e) = crashsafe::fsync(&dir) {
|
||||
// Something removed a newly created timeline dir out from underneath us? Extremely
|
||||
@@ -1866,7 +1842,7 @@ impl TenantManager {
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let tmp_path = self
|
||||
.detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
|
||||
.detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
|
||||
.await?;
|
||||
spawn_background_purge(tmp_path);
|
||||
|
||||
@@ -1876,7 +1852,6 @@ impl TenantManager {
|
||||
async fn detach_tenant0(
|
||||
&self,
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &std::sync::RwLock<TenantsMap>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<Utf8PathBuf, TenantStateError> {
|
||||
@@ -1890,7 +1865,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
let removal_result = remove_tenant_from_memory(
|
||||
tenants,
|
||||
self.tenants,
|
||||
tenant_shard_id,
|
||||
tenant_dir_rename_operation(tenant_shard_id),
|
||||
)
|
||||
@@ -1906,7 +1881,7 @@ impl TenantManager {
|
||||
pub(crate) fn list_tenants(
|
||||
&self,
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let tenants = self.tenants.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
@@ -2007,7 +1982,6 @@ impl TenantManager {
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)?;
|
||||
|
||||
@@ -262,6 +262,7 @@ impl scheduler::RunningJob for RunningDownload {
|
||||
struct CompleteDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
completed_at: Instant,
|
||||
result: Result<(), UpdateError>,
|
||||
}
|
||||
|
||||
impl scheduler::Completion for CompleteDownload {
|
||||
@@ -286,21 +287,33 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
let CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: _completed_at,
|
||||
result,
|
||||
} = completion;
|
||||
|
||||
tracing::debug!("Secondary tenant download completed");
|
||||
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
match result {
|
||||
Err(UpdateError::Restart) => {
|
||||
// Start downloading again as soon as we can. This will involve waiting for the scheduler's
|
||||
// scheduling interval. This slightly reduces the peak download speed of tenants that hit their
|
||||
// deadline and keep restarting, but that also helps give other tenants a chance to execute rather
|
||||
// that letting one big tenant dominate for a long time.
|
||||
detail.next_download = Some(Instant::now());
|
||||
}
|
||||
_ => {
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
@@ -396,9 +409,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
(RunningDownload { barrier }, Box::pin(async move {
|
||||
let _completion = completion;
|
||||
|
||||
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
|
||||
let result = TenantDownloader::new(conf, &remote_storage, &secondary_state)
|
||||
.download(&download_ctx)
|
||||
.await
|
||||
.await;
|
||||
match &result
|
||||
{
|
||||
Err(UpdateError::NoData) => {
|
||||
tracing::info!("No heatmap found for tenant. This is fine if it is new.");
|
||||
@@ -415,6 +429,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
|
||||
tracing::error!("Error while downloading tenant: {e}");
|
||||
},
|
||||
Err(UpdateError::Restart) => {
|
||||
tracing::info!("Download reached deadline & will restart to update heatmap")
|
||||
}
|
||||
Ok(()) => {}
|
||||
};
|
||||
|
||||
@@ -436,6 +453,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: Instant::now(),
|
||||
result
|
||||
}
|
||||
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||
}
|
||||
@@ -452,6 +470,11 @@ struct TenantDownloader<'a> {
|
||||
/// Errors that may be encountered while updating a tenant
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum UpdateError {
|
||||
/// This is not a true failure, but it's how a download indicates that it would like to be restarted by
|
||||
/// the scheduler, to pick up the latest heatmap
|
||||
#[error("Reached deadline, restarting downloads")]
|
||||
Restart,
|
||||
|
||||
#[error("No remote data found")]
|
||||
NoData,
|
||||
#[error("Insufficient local storage space")]
|
||||
@@ -603,6 +626,26 @@ impl<'a> TenantDownloader<'a> {
|
||||
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
|
||||
}
|
||||
|
||||
// Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again,
|
||||
// so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might
|
||||
// spend 10s of minutes downloading layers we don't need.
|
||||
// (see https://github.com/neondatabase/neon/issues/8182)
|
||||
let deadline = {
|
||||
let period = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// Use double the period: we are not promising to complete within the period, this is just a heuristic
|
||||
// to keep using a "reasonably fresh" heatmap.
|
||||
Instant::now() + period * 2
|
||||
};
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
let timeline_state = timeline_states
|
||||
@@ -618,7 +661,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline, timeline_state, ctx)
|
||||
self.download_timeline(timeline, timeline_state, deadline, ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
@@ -827,26 +870,28 @@ impl<'a> TenantDownloader<'a> {
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
async fn download_timeline(
|
||||
/// Download heatmap layers that are not present on local disk, or update their
|
||||
/// access time if they are already present.
|
||||
async fn download_timeline_layers(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
deadline: Instant,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
) -> (Result<(), UpdateError>, Vec<HeatMapLayer>) {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return Err(UpdateError::Cancelled);
|
||||
return (Err(UpdateError::Cancelled), touched);
|
||||
}
|
||||
|
||||
if Instant::now() > deadline {
|
||||
// We've been running downloads for a while, restart to download latest heatmap.
|
||||
return (Err(UpdateError::Restart), touched);
|
||||
}
|
||||
|
||||
// Existing on-disk layers: just update their access time.
|
||||
@@ -916,20 +961,43 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.await?
|
||||
.await
|
||||
{
|
||||
Some(layer) => touched.push(layer),
|
||||
None => {
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
Ok(None) => {
|
||||
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
|
||||
// things to consider touched.
|
||||
}
|
||||
Err(e) => {
|
||||
return (Err(e), touched);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write updates to state to record layers we just downloaded or touched.
|
||||
(Ok(()), touched)
|
||||
}
|
||||
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
deadline: Instant,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
let (result, touched) = self
|
||||
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
|
||||
.await;
|
||||
|
||||
// Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful
|
||||
{
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
|
||||
let timeline_detail = detail.timelines.entry(timeline_id).or_default();
|
||||
|
||||
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
|
||||
|
||||
@@ -943,14 +1011,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&timeline_id,
|
||||
&t.name,
|
||||
&t.metadata.generation,
|
||||
);
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&timeline_id,
|
||||
t.name,
|
||||
t.metadata.clone(),
|
||||
t.access_time,
|
||||
@@ -961,7 +1029,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
result
|
||||
}
|
||||
|
||||
/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
|
||||
|
||||
@@ -367,10 +367,9 @@ async fn upload_tenant_heatmap(
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let generation = tenant.get_generation();
|
||||
debug_assert!(!generation.is_none());
|
||||
if generation.is_none() {
|
||||
// We do not expect this: generations were implemented before heatmap uploads. However,
|
||||
// handle it so that we don't have to make the generation in the heatmap an Option<>
|
||||
// (Generation::none is not serializable)
|
||||
// We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants
|
||||
tracing::warn!("Skipping heatmap upload for tenant with generation==None");
|
||||
return Ok(UploadHeatmapOutcome::Skipped);
|
||||
}
|
||||
|
||||
@@ -928,7 +928,6 @@ impl DeltaLayerInner {
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -941,7 +940,7 @@ impl DeltaLayerInner {
|
||||
);
|
||||
let mut result = Vec::new();
|
||||
let mut stream =
|
||||
Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
@@ -976,7 +975,7 @@ impl DeltaLayerInner {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>>
|
||||
where
|
||||
Reader: BlockReader,
|
||||
Reader: BlockReader + Clone,
|
||||
{
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
@@ -986,7 +985,7 @@ impl DeltaLayerInner {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
|
||||
let index_stream = index_reader.get_stream_from(&start_key.0, &ctx);
|
||||
let index_stream = index_reader.clone().into_stream(&start_key.0, &ctx);
|
||||
let mut index_stream = std::pin::pin!(index_stream);
|
||||
|
||||
while let Some(index_entry) = index_stream.next().await {
|
||||
@@ -1241,7 +1240,7 @@ impl DeltaLayerInner {
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
|
||||
let stream = self.stream_index_forwards(tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
|
||||
let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
|
||||
// put in a sentinel value for getting the end offset for last item, and not having to
|
||||
// repeat the whole read part
|
||||
@@ -1300,7 +1299,7 @@ impl DeltaLayerInner {
|
||||
offsets.start.pos(),
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
max_read_size,
|
||||
Some(max_read_size),
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -1459,17 +1458,17 @@ impl DeltaLayerInner {
|
||||
|
||||
fn stream_index_forwards<'a, R>(
|
||||
&'a self,
|
||||
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
|
||||
reader: DiskBtreeReader<R, DELTA_KEY_SIZE>,
|
||||
start: &'a [u8; DELTA_KEY_SIZE],
|
||||
ctx: &'a RequestContext,
|
||||
) -> impl futures::stream::Stream<
|
||||
Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
|
||||
> + 'a
|
||||
where
|
||||
R: BlockReader,
|
||||
R: BlockReader + 'a,
|
||||
{
|
||||
use futures::stream::TryStreamExt;
|
||||
let stream = reader.get_stream_from(start, ctx);
|
||||
let stream = reader.into_stream(start, ctx);
|
||||
stream.map_ok(|(key, value)| {
|
||||
let key = DeltaKey::from_slice(&key);
|
||||
let (key, lsn) = (key.key(), key.lsn());
|
||||
@@ -1493,6 +1492,24 @@ impl DeltaLayerInner {
|
||||
);
|
||||
offset
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
DeltaLayerIterator {
|
||||
delta_layer: self,
|
||||
ctx,
|
||||
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
@@ -1552,6 +1569,70 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct DeltaLayerIterator<'a> {
|
||||
delta_layer: &'a DeltaLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
assert!(!self.is_end);
|
||||
|
||||
let plan = loop {
|
||||
if let Some(res) = self.index_iter.next().await {
|
||||
let (raw_key, value) = res?;
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
let offset = blob_ref.pos();
|
||||
if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
self.is_end = true;
|
||||
let data_end_offset = self.delta_layer.index_start_offset();
|
||||
break self.planner.handle_range_end(data_end_offset);
|
||||
}
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
|
||||
let mut next_batch = std::collections::VecDeque::new();
|
||||
let buf_size = plan.size();
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = Value::des(&frozen_buf[meta.start..meta.end])?;
|
||||
next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.key_values_batch.is_empty() {
|
||||
if self.is_end {
|
||||
return Ok(None);
|
||||
}
|
||||
self.next_batch().await?;
|
||||
}
|
||||
Ok(Some(
|
||||
self.key_values_batch
|
||||
.pop_front()
|
||||
.expect("should not be empty"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
@@ -1561,6 +1642,9 @@ mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
@@ -1857,7 +1941,7 @@ mod test {
|
||||
.finish(entries_meta.key_range.end, &timeline, &ctx)
|
||||
.await?;
|
||||
|
||||
let inner = resident.as_delta(&ctx).await?;
|
||||
let inner = resident.get_as_delta(&ctx).await?;
|
||||
|
||||
let file_size = inner.file.metadata().await?.len();
|
||||
tracing::info!(
|
||||
@@ -2044,11 +2128,11 @@ mod test {
|
||||
|
||||
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
|
||||
|
||||
copied_layer.as_delta(ctx).await.unwrap();
|
||||
copied_layer.get_as_delta(ctx).await.unwrap();
|
||||
|
||||
assert_keys_and_values_eq(
|
||||
new_layer.as_delta(ctx).await.unwrap(),
|
||||
copied_layer.as_delta(ctx).await.unwrap(),
|
||||
new_layer.get_as_delta(ctx).await.unwrap(),
|
||||
copied_layer.get_as_delta(ctx).await.unwrap(),
|
||||
truncate_at,
|
||||
ctx,
|
||||
)
|
||||
@@ -2073,7 +2157,7 @@ mod test {
|
||||
source.index_root_blk,
|
||||
&source_reader,
|
||||
);
|
||||
let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
|
||||
let source_stream = source.stream_index_forwards(source_tree, &start_key, ctx);
|
||||
let source_stream = source_stream.filter(|res| match res {
|
||||
Ok((_, lsn, _)) => ready(lsn < &truncated_at),
|
||||
_ => ready(true),
|
||||
@@ -2086,7 +2170,7 @@ mod test {
|
||||
truncated.index_root_blk,
|
||||
&truncated_reader,
|
||||
);
|
||||
let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
|
||||
let truncated_stream = truncated.stream_index_forwards(truncated_tree, &start_key, ctx);
|
||||
let mut truncated_stream = std::pin::pin!(truncated_stream);
|
||||
|
||||
let mut scratch_left = Vec::new();
|
||||
@@ -2127,4 +2211,116 @@ mod test {
|
||||
assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
|
||||
}
|
||||
}
|
||||
|
||||
async fn produce_delta_layer(
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut deltas: Vec<(Key, Lsn, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
|
||||
let (key_start, _, _) = deltas.first().unwrap();
|
||||
let (key_max, _, _) = deltas.first().unwrap();
|
||||
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
let lsn_end = Lsn(lsn_max.0 + 1);
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
*key_start,
|
||||
(*lsn_min)..lsn_end,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let key_end = key_max.next();
|
||||
|
||||
for (key, lsn, value) in deltas {
|
||||
writer.put_value(key, lsn, value, ctx).await?;
|
||||
}
|
||||
let delta_layer = writer.finish(key_end, tline, ctx).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(delta_layer)
|
||||
}
|
||||
|
||||
async fn assert_delta_iter_equal(
|
||||
delta_iter: &mut DeltaLayerIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = delta_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
assert_eq!(o1.is_some(), o2.is_some());
|
||||
if o1.is_none() && o2.is_none() {
|
||||
break;
|
||||
}
|
||||
let (k1, l1, v1) = o1.unwrap();
|
||||
let (k2, l2, v2) = o2.unwrap();
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, *l2);
|
||||
assert_eq!(&v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_layer_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 1000;
|
||||
let test_deltas = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32 / 10),
|
||||
Lsn(0x10 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap();
|
||||
for max_read_size in [1, 1024] {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
let mut iter = delta_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut num_items = 0;
|
||||
for _ in 0..3 {
|
||||
iter.next_batch().await.unwrap();
|
||||
num_items += iter.key_values_batch.len();
|
||||
if max_read_size == 1 {
|
||||
// every key should be a batch b/c the value is larger than max_read_size
|
||||
assert_eq!(iter.key_values_batch.len(), 1);
|
||||
} else {
|
||||
assert_eq!(iter.key_values_batch.len(), batch_size);
|
||||
}
|
||||
if num_items >= N {
|
||||
break;
|
||||
}
|
||||
iter.key_values_batch.clear();
|
||||
}
|
||||
// Test if the result is correct
|
||||
let mut iter = delta_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
assert_delta_iter_equal(&mut iter, &test_deltas).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -486,7 +486,6 @@ impl ImageLayerInner {
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -495,7 +494,7 @@ impl ImageLayerInner {
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
let mut result = Vec::new();
|
||||
let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
|
||||
let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
while let Some(item) = stream.next().await {
|
||||
@@ -544,7 +543,7 @@ impl ImageLayerInner {
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
let index_stream = tree_reader.get_stream_from(&search_key, &ctx);
|
||||
let index_stream = tree_reader.clone().into_stream(&search_key, &ctx);
|
||||
let mut index_stream = std::pin::pin!(index_stream);
|
||||
|
||||
while let Some(index_entry) = index_stream.next().await {
|
||||
@@ -689,6 +688,24 @@ impl ImageLayerInner {
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
ImageLayerIterator {
|
||||
image_layer: self,
|
||||
ctx,
|
||||
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
@@ -943,11 +960,77 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct ImageLayerIterator<'a> {
|
||||
image_layer: &'a ImageLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
assert!(!self.is_end);
|
||||
|
||||
let plan = loop {
|
||||
if let Some(res) = self.index_iter.next().await {
|
||||
let (raw_key, offset) = res?;
|
||||
if let Some(batch_plan) = self.planner.handle(
|
||||
Key::from_slice(&raw_key[..KEY_SIZE]),
|
||||
self.image_layer.lsn,
|
||||
offset,
|
||||
BlobFlag::None,
|
||||
) {
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
self.is_end = true;
|
||||
let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
break self.planner.handle_range_end(payload_end);
|
||||
}
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
|
||||
let mut next_batch = std::collections::VecDeque::new();
|
||||
let buf_size = plan.size();
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf: Bytes = blobs_buf.buf.freeze();
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.key_values_batch.is_empty() {
|
||||
if self.is_end {
|
||||
return Ok(None);
|
||||
}
|
||||
self.next_batch().await?;
|
||||
}
|
||||
Ok(Some(
|
||||
self.key_values_batch
|
||||
.pop_front()
|
||||
.expect("should not be empty"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::time::Duration;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use bytes::Bytes;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
@@ -959,11 +1042,19 @@ mod test {
|
||||
};
|
||||
|
||||
use crate::{
|
||||
tenant::{config::TenantConf, harness::TenantHarness},
|
||||
context::RequestContext,
|
||||
repository::Value,
|
||||
tenant::{
|
||||
config::TenantConf,
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::ResidentLayer,
|
||||
vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use super::ImageLayerWriter;
|
||||
use super::{ImageLayerIterator, ImageLayerWriter};
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
@@ -1134,4 +1225,111 @@ mod test {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn produce_image_layer(
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut images: Vec<(Key, Bytes)>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
images.sort();
|
||||
let (key_start, _) = images.first().unwrap();
|
||||
let (key_last, _) = images.last().unwrap();
|
||||
let key_end = key_last.next();
|
||||
let key_range = *key_start..key_end;
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
&key_range,
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for (key, img) in images {
|
||||
writer.put_image(key, img, ctx).await?;
|
||||
}
|
||||
let img_layer = writer.finish(tline, ctx).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(img_layer)
|
||||
}
|
||||
|
||||
async fn assert_img_iter_equal(
|
||||
img_iter: &mut ImageLayerIterator<'_>,
|
||||
expect: &[(Key, Bytes)],
|
||||
expect_lsn: Lsn,
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = img_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
match (o1, o2) {
|
||||
(None, None) => break,
|
||||
(Some((k1, l1, v1)), Some((k2, i2))) => {
|
||||
let Value::Image(i1) = v1 else {
|
||||
panic!("expect Value::Image")
|
||||
};
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, expect_lsn);
|
||||
assert_eq!(&i1, i2);
|
||||
}
|
||||
(o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_iterator() {
|
||||
let harness = TenantHarness::create("image_layer_iterator").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 1000;
|
||||
let test_imgs = (0..N)
|
||||
.map(|idx| (get_key(idx as u32), Bytes::from(format!("img{idx:05}"))))
|
||||
.collect_vec();
|
||||
let resident_layer =
|
||||
produce_image_layer(&tenant, &tline, test_imgs.clone(), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
|
||||
for max_read_size in [1, 1024] {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
let mut iter = img_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut num_items = 0;
|
||||
for _ in 0..3 {
|
||||
iter.next_batch().await.unwrap();
|
||||
num_items += iter.key_values_batch.len();
|
||||
if max_read_size == 1 {
|
||||
// every key should be a batch b/c the value is larger than max_read_size
|
||||
assert_eq!(iter.key_values_batch.len(), 1);
|
||||
} else {
|
||||
assert_eq!(iter.key_values_batch.len(), batch_size);
|
||||
}
|
||||
if num_items >= N {
|
||||
break;
|
||||
}
|
||||
iter.key_values_batch.clear();
|
||||
}
|
||||
// Test if the result is correct
|
||||
let mut iter = img_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -622,18 +622,16 @@ impl InMemoryLayer {
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
let keys: Vec<_> = if let Some(key_range) = key_range {
|
||||
let key_count = if let Some(key_range) = key_range {
|
||||
inner
|
||||
.index
|
||||
.iter()
|
||||
.filter(|(k, _)| key_range.contains(k))
|
||||
.map(|(k, m)| (k.to_i128(), m))
|
||||
.collect()
|
||||
.count()
|
||||
} else {
|
||||
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
|
||||
inner.index.len()
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
if key_count == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
|
||||
@@ -93,16 +93,12 @@ pub(crate) struct Layer(Arc<LayerInner>);
|
||||
|
||||
impl std::fmt::Display for Layer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if matches!(self.0.generation, Generation::Broken) {
|
||||
write!(f, "{}-broken", self.layer_desc().short_id())
|
||||
} else {
|
||||
write!(
|
||||
f,
|
||||
"{}{}",
|
||||
self.layer_desc().short_id(),
|
||||
self.0.generation.get_suffix()
|
||||
)
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
"{}{}",
|
||||
self.layer_desc().short_id(),
|
||||
self.0.generation.get_suffix()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,7 +385,6 @@ impl Layer {
|
||||
}
|
||||
|
||||
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -1774,7 +1769,6 @@ impl DownloadedLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn load_key_values(
|
||||
&self,
|
||||
owner: &Arc<LayerInner>,
|
||||
@@ -1905,7 +1899,7 @@ impl ResidentLayer {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn as_delta(
|
||||
pub(crate) async fn get_as_delta(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
|
||||
@@ -1915,6 +1909,18 @@ impl ResidentLayer {
|
||||
Image(_) => Err(anyhow::anyhow!("image layer")),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn get_as_image(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<&image_layer::ImageLayerInner> {
|
||||
use LayerKind::*;
|
||||
match self.downloaded.get(&self.owner.0, ctx).await? {
|
||||
Image(ref d) => Ok(d),
|
||||
Delta(_) => Err(anyhow::anyhow!("delta layer")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ResidentLayer {
|
||||
|
||||
@@ -686,6 +686,7 @@ pub enum GetLogicalSizePriority {
|
||||
pub(crate) enum CompactFlags {
|
||||
ForceRepartition,
|
||||
ForceImageLayerCreation,
|
||||
EnhancedGcBottomMostCompaction,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Timeline {
|
||||
@@ -1096,7 +1097,6 @@ impl Timeline {
|
||||
/// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored
|
||||
/// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that
|
||||
/// the scan operation will not cause OOM in the future.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn scan(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -5481,12 +5481,12 @@ impl Timeline {
|
||||
}
|
||||
images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
|
||||
let min_key = *images.first().map(|(k, _)| k).unwrap();
|
||||
let max_key = images.last().map(|(k, _)| k).unwrap().next();
|
||||
let end_key = images.last().map(|(k, _)| k).unwrap().next();
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(min_key..max_key),
|
||||
&(min_key..end_key),
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
@@ -5518,7 +5518,7 @@ impl Timeline {
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
|
||||
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
|
||||
let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
|
||||
let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
|
||||
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
assert!(
|
||||
@@ -5541,7 +5541,7 @@ impl Timeline {
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
|
||||
let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?;
|
||||
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
@@ -47,10 +47,14 @@ impl Timeline {
|
||||
/// TODO: cancellation
|
||||
pub(crate) async fn compact_legacy(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
|
||||
return self.compact_with_gc(cancel, ctx).await;
|
||||
}
|
||||
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
// 1. First, calculate the desired "partitioning" of the
|
||||
@@ -959,13 +963,20 @@ impl Timeline {
|
||||
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
use crate::tenant::storage_layer::ValueReconstructState;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
info!("running enhanced gc bottom-most compaction");
|
||||
|
||||
scopeguard::defer! {
|
||||
info!("done enhanced gc bottom-most compaction");
|
||||
};
|
||||
|
||||
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
@@ -974,6 +985,11 @@ impl Timeline {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"enhanced legacy compaction currently does not support retain_lsns (branches)"
|
||||
)));
|
||||
}
|
||||
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
|
||||
let mut selected_layers = Vec::new();
|
||||
// TODO: consider retain_lsns
|
||||
@@ -985,21 +1001,36 @@ impl Timeline {
|
||||
}
|
||||
(selected_layers, gc_cutoff)
|
||||
};
|
||||
info!(
|
||||
"picked {} layers for compaction with gc_cutoff={}",
|
||||
layer_selection.len(),
|
||||
gc_cutoff
|
||||
);
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, collect the layer information to decide when to split the new delta layers.
|
||||
let mut all_key_values = Vec::new();
|
||||
let mut delta_split_points = BTreeSet::new();
|
||||
for layer in &layer_selection {
|
||||
all_key_values.extend(layer.load_key_values(ctx).await?);
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
|
||||
// so that we can avoid having too many small delta layers.
|
||||
let key_range = desc.get_key_range();
|
||||
delta_split_points.insert(key_range.start);
|
||||
delta_split_points.insert(key_range.end);
|
||||
}
|
||||
}
|
||||
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
|
||||
// image layers, make image appear later than delta.
|
||||
// image layers, make image appear before than delta.
|
||||
struct ValueWrapper<'a>(&'a crate::repository::Value);
|
||||
impl Ord for ValueWrapper<'_> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use crate::repository::Value;
|
||||
use std::cmp::Ordering;
|
||||
match (self.0, other.0) {
|
||||
(Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
|
||||
(Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
|
||||
(Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
|
||||
(Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
|
||||
_ => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
@@ -1018,13 +1049,6 @@ impl Timeline {
|
||||
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
|
||||
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
|
||||
});
|
||||
let max_lsn = all_key_values
|
||||
.iter()
|
||||
.map(|(_, lsn, _)| lsn)
|
||||
.max()
|
||||
.copied()
|
||||
.unwrap()
|
||||
+ 1;
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
@@ -1043,14 +1067,24 @@ impl Timeline {
|
||||
// We have a list of deltas/images. We want to create image layers while collect garbages.
|
||||
for (key, lsn, val) in accumulated_values.iter().rev() {
|
||||
if *lsn > horizon {
|
||||
keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
|
||||
if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() {
|
||||
if *prev_lsn == *lsn {
|
||||
// The case that we have an LSN with both data from the delta layer and the image layer. As
|
||||
// `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
|
||||
// drop this delta and keep the image.
|
||||
//
|
||||
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
|
||||
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
|
||||
// dropped.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
keys_above_horizon.push((*key, *lsn, val.clone()));
|
||||
} else if *lsn <= horizon {
|
||||
match val {
|
||||
crate::repository::Value::Image(image) => {
|
||||
if lsn <= &horizon {
|
||||
base_image = Some((*lsn, image.clone()));
|
||||
break;
|
||||
}
|
||||
base_image = Some((*lsn, image.clone()));
|
||||
break;
|
||||
}
|
||||
crate::repository::Value::WalRecord(wal) => {
|
||||
delta_above_base_image.push((*lsn, wal.clone()));
|
||||
@@ -1058,7 +1092,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
delta_above_base_image.reverse();
|
||||
// do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records
|
||||
keys_above_horizon.reverse();
|
||||
let state = ValueReconstructState {
|
||||
img: base_image,
|
||||
@@ -1068,15 +1102,59 @@ impl Timeline {
|
||||
Ok((keys_above_horizon, img))
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
all_key_values.first().unwrap().0,
|
||||
gc_cutoff..max_lsn, // TODO: off by one?
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
async fn flush_deltas(
|
||||
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
|
||||
last_key: Key,
|
||||
delta_split_points: &[Key],
|
||||
current_delta_split_point: &mut usize,
|
||||
tline: &Arc<Timeline>,
|
||||
gc_cutoff: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<ResidentLayer>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
// overlapping layers.
|
||||
//
|
||||
// If we have a structure like this:
|
||||
//
|
||||
// | Delta 1 | | Delta 4 |
|
||||
// |---------| Delta 2 |---------|
|
||||
// | Delta 3 | | Delta 5 |
|
||||
//
|
||||
// And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4.
|
||||
// A simple solution here is to split the delta layers using the original boundary, while this
|
||||
// might produce a lot of small layers. This should be improved and fixed in the future.
|
||||
let mut need_split = false;
|
||||
while *current_delta_split_point < delta_split_points.len()
|
||||
&& last_key >= delta_split_points[*current_delta_split_point]
|
||||
{
|
||||
*current_delta_split_point += 1;
|
||||
need_split = true;
|
||||
}
|
||||
if !need_split {
|
||||
return Ok(None);
|
||||
}
|
||||
let deltas = std::mem::take(deltas);
|
||||
if deltas.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
tline.conf,
|
||||
tline.timeline_id,
|
||||
tline.tenant_shard_id,
|
||||
deltas.first().unwrap().0,
|
||||
gc_cutoff..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let key_end = deltas.last().unwrap().0.next();
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?;
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -1087,6 +1165,10 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut delta_values = Vec::new();
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
for item @ (key, _, _) in &all_key_values {
|
||||
if &last_key == key {
|
||||
accumulated_values.push(item);
|
||||
@@ -1094,34 +1176,63 @@ impl Timeline {
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
delta_values.extend(deltas);
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
accumulated_values.clear();
|
||||
accumulated_values.push(item);
|
||||
last_key = *key;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: move this part to the loop body
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
accumulated_values.clear();
|
||||
// TODO: split layers
|
||||
let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
|
||||
delta_values.extend(deltas);
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
info!(
|
||||
"produced {} delta layers and {} image layers",
|
||||
delta_layers.len(),
|
||||
1
|
||||
);
|
||||
let mut compact_to = Vec::new();
|
||||
compact_to.extend(delta_layers);
|
||||
compact_to.push(image_layer);
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.finish_gc_compaction(
|
||||
&layer_selection,
|
||||
&[delta_layer.clone(), image_layer.clone()],
|
||||
&self.metrics,
|
||||
)
|
||||
guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
|
||||
};
|
||||
|
||||
self.remote_client
|
||||
.schedule_compaction_update(&layer_selection, &compact_to)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -255,7 +255,6 @@ impl DeleteTimelineFlow {
|
||||
}
|
||||
|
||||
/// Shortcut to create Timeline in stopping state and spawn deletion task.
|
||||
/// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn resume_deletion(
|
||||
tenant: Arc<Tenant>,
|
||||
@@ -420,10 +419,6 @@ impl DeleteTimelineFlow {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn is_finished(&self) -> bool {
|
||||
matches!(self, Self::Finished)
|
||||
}
|
||||
|
||||
pub(crate) fn is_not_started(&self) -> bool {
|
||||
matches!(self, Self::NotStarted)
|
||||
}
|
||||
|
||||
@@ -227,7 +227,6 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called when a GC-compaction is completed.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn finish_gc_compaction(
|
||||
&mut self,
|
||||
compact_from: &[Layer],
|
||||
|
||||
@@ -20,6 +20,7 @@ use std::num::NonZeroUsize;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::vec_map::VecMap;
|
||||
|
||||
@@ -77,7 +78,7 @@ pub(crate) struct VectoredReadBuilder {
|
||||
start: u64,
|
||||
end: u64,
|
||||
blobs_at: VecMap<u64, BlobMeta>,
|
||||
max_read_size: usize,
|
||||
max_read_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl VectoredReadBuilder {
|
||||
@@ -90,7 +91,7 @@ impl VectoredReadBuilder {
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
max_read_size: Option<usize>,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
@@ -111,7 +112,13 @@ impl VectoredReadBuilder {
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
tracing::trace!(start, end, "trying to extend");
|
||||
let size = (end - start) as usize;
|
||||
if self.end == start && self.size() + size <= self.max_read_size {
|
||||
if self.end == start && {
|
||||
if let Some(max_read_size) = self.max_read_size {
|
||||
self.size() + size <= max_read_size
|
||||
} else {
|
||||
true
|
||||
}
|
||||
} {
|
||||
self.end = end;
|
||||
self.blobs_at
|
||||
.append(start, meta)
|
||||
@@ -157,7 +164,7 @@ pub struct VectoredReadPlanner {
|
||||
// Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64, BlobFlag)>,
|
||||
|
||||
max_read_size: usize,
|
||||
max_read_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl VectoredReadPlanner {
|
||||
@@ -165,7 +172,20 @@ impl VectoredReadPlanner {
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size,
|
||||
max_read_size: Some(max_read_size),
|
||||
}
|
||||
}
|
||||
|
||||
/// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`],
|
||||
/// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored
|
||||
/// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the
|
||||
/// code path to split reads into chunks of `max_read_size`, and controls the read size itself.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn new_caller_controlled_max_limit() -> Self {
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,8 +317,9 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
);
|
||||
let buf = self
|
||||
.file
|
||||
.read_exact_at_n(buf, read.start, read.size(), ctx)
|
||||
.await?;
|
||||
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
let blobs_at = read.blobs_at.as_slice();
|
||||
let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
|
||||
@@ -354,6 +375,87 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
|
||||
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and
|
||||
/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`].
|
||||
#[cfg(test)]
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
planner: VectoredReadPlanner,
|
||||
/// Max read size per batch
|
||||
max_read_size: u64,
|
||||
/// Max item count per batch
|
||||
max_cnt: usize,
|
||||
/// The first offset of this batch
|
||||
this_batch_first_offset: Option<u64>,
|
||||
/// Size of the current batch
|
||||
cnt: usize,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl StreamingVectoredReadPlanner {
|
||||
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
|
||||
assert!(max_cnt > 0);
|
||||
assert!(max_read_size > 0);
|
||||
Self {
|
||||
// We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call.
|
||||
// Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability,
|
||||
// to avoid splitting into two I/Os.
|
||||
planner: VectoredReadPlanner::new_caller_controlled_max_limit(),
|
||||
max_cnt,
|
||||
max_read_size,
|
||||
this_batch_first_offset: None,
|
||||
cnt: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead {
|
||||
let planner = std::mem::replace(
|
||||
&mut self.planner,
|
||||
VectoredReadPlanner::new_caller_controlled_max_limit(),
|
||||
);
|
||||
self.this_batch_first_offset = Some(this_batch_first_offset);
|
||||
self.cnt = 1;
|
||||
let mut batch = planner.finish();
|
||||
assert_eq!(batch.len(), 1, "should have exactly one read batch");
|
||||
batch.pop().unwrap()
|
||||
}
|
||||
|
||||
pub fn handle(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
offset: u64,
|
||||
flag: BlobFlag,
|
||||
) -> Option<VectoredRead> {
|
||||
if let Some(begin_offset) = self.this_batch_first_offset {
|
||||
// Each batch will have at least one item b/c `self.this_batch_first_offset` is set
|
||||
// after one item gets processed
|
||||
if offset - begin_offset > self.max_read_size {
|
||||
self.planner.handle_range_end(offset); // End the current batch with the offset
|
||||
let batch = self.emit(offset); // Produce a batch
|
||||
self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
|
||||
return Some(batch);
|
||||
}
|
||||
} else {
|
||||
self.this_batch_first_offset = Some(offset)
|
||||
}
|
||||
if self.cnt >= self.max_cnt {
|
||||
self.planner.handle_range_end(offset); // End the current batch with the offset
|
||||
let batch = self.emit(offset); // Produce a batch
|
||||
self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
|
||||
return Some(batch);
|
||||
}
|
||||
self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch
|
||||
self.cnt += 1;
|
||||
None
|
||||
}
|
||||
|
||||
pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead {
|
||||
self.planner.handle_range_end(offset);
|
||||
self.emit(offset)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
|
||||
|
||||
use crate::page_cache::PageWriteGuard;
|
||||
use crate::page_cache::{PageWriteGuard, PAGE_SZ};
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -48,6 +48,7 @@ pub(crate) mod owned_buffers_io {
|
||||
//! but for the time being we're proving out the primitives in the neon.git repo
|
||||
//! for faster iteration.
|
||||
|
||||
pub(crate) mod slice;
|
||||
pub(crate) mod write;
|
||||
pub(crate) mod util {
|
||||
pub(crate) mod size_tracking_writer;
|
||||
@@ -143,16 +144,17 @@ struct SlotInner {
|
||||
/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
|
||||
struct PageWriteGuardBuf {
|
||||
page: PageWriteGuard<'static>,
|
||||
init_up_to: usize,
|
||||
}
|
||||
// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
|
||||
// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
|
||||
// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good.
|
||||
// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.)
|
||||
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
self.page.as_ptr()
|
||||
}
|
||||
fn bytes_init(&self) -> usize {
|
||||
self.init_up_to
|
||||
self.page.len()
|
||||
}
|
||||
fn bytes_total(&self) -> usize {
|
||||
self.page.len()
|
||||
@@ -166,8 +168,8 @@ unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
|
||||
}
|
||||
|
||||
unsafe fn set_init(&mut self, pos: usize) {
|
||||
// There shouldn't really be any reason to call this API since bytes_init() == bytes_total().
|
||||
assert!(pos <= self.page.len());
|
||||
self.init_up_to = pos;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -585,37 +587,37 @@ impl VirtualFile {
|
||||
Ok(self.pos)
|
||||
}
|
||||
|
||||
pub async fn read_exact_at<B>(
|
||||
/// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`.
|
||||
///
|
||||
/// The returned `Slice<Buf>` is equivalent to the input `slice`, i.e., it's the same view into the same buffer.
|
||||
pub async fn read_exact_at<Buf>(
|
||||
&self,
|
||||
buf: B,
|
||||
slice: Slice<Buf>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<B, Error>
|
||||
) -> Result<Slice<Buf>, Error>
|
||||
where
|
||||
B: IoBufMut + Send,
|
||||
Buf: IoBufMut + Send,
|
||||
{
|
||||
let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| {
|
||||
self.read_at(buf, offset, ctx)
|
||||
})
|
||||
.await;
|
||||
res.map(|()| buf)
|
||||
}
|
||||
let assert_we_return_original_bounds = if cfg!(debug_assertions) {
|
||||
Some((slice.stable_ptr() as usize, slice.bytes_total()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
pub async fn read_exact_at_n<B>(
|
||||
&self,
|
||||
buf: B,
|
||||
offset: u64,
|
||||
count: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<B, Error>
|
||||
where
|
||||
B: IoBufMut + Send,
|
||||
{
|
||||
let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
|
||||
self.read_at(buf, offset, ctx)
|
||||
})
|
||||
.await;
|
||||
res.map(|()| buf)
|
||||
let original_bounds = slice.bounds();
|
||||
let (buf, res) =
|
||||
read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await;
|
||||
let res = res.map(|_| buf.slice(original_bounds));
|
||||
|
||||
if let Some(original_bounds) = assert_we_return_original_bounds {
|
||||
if let Ok(slice) = &res {
|
||||
let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total());
|
||||
assert_eq!(original_bounds, returned_bounds);
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
|
||||
@@ -625,13 +627,11 @@ impl VirtualFile {
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PageWriteGuard<'static>, Error> {
|
||||
let buf = PageWriteGuardBuf {
|
||||
page,
|
||||
init_up_to: 0,
|
||||
};
|
||||
let res = self.read_exact_at(buf, offset, ctx).await;
|
||||
res.map(|PageWriteGuardBuf { page, .. }| page)
|
||||
.map_err(|e| Error::new(ErrorKind::Other, e))
|
||||
let buf = PageWriteGuardBuf { page }.slice_full();
|
||||
debug_assert_eq!(buf.bytes_total(), PAGE_SZ);
|
||||
self.read_exact_at(buf, offset, ctx)
|
||||
.await
|
||||
.map(|slice| slice.into_inner().page)
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
@@ -722,14 +722,14 @@ impl VirtualFile {
|
||||
(buf, Ok(n))
|
||||
}
|
||||
|
||||
pub(crate) async fn read_at<B>(
|
||||
pub(crate) async fn read_at<Buf>(
|
||||
&self,
|
||||
buf: B,
|
||||
buf: tokio_epoll_uring::Slice<Buf>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (B, Result<usize, Error>)
|
||||
) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
|
||||
where
|
||||
B: tokio_epoll_uring::BoundedBufMut + Send,
|
||||
Buf: tokio_epoll_uring::IoBufMut + Send,
|
||||
{
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
@@ -781,26 +781,16 @@ impl VirtualFile {
|
||||
}
|
||||
|
||||
// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
|
||||
pub async fn read_exact_at_impl<B, F, Fut>(
|
||||
buf: B,
|
||||
pub async fn read_exact_at_impl<Buf, F, Fut>(
|
||||
mut buf: tokio_epoll_uring::Slice<Buf>,
|
||||
mut offset: u64,
|
||||
count: Option<usize>,
|
||||
mut read_at: F,
|
||||
) -> (B, std::io::Result<()>)
|
||||
) -> (Buf, std::io::Result<()>)
|
||||
where
|
||||
B: IoBufMut + Send,
|
||||
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
||||
Buf: IoBufMut + Send,
|
||||
F: FnMut(tokio_epoll_uring::Slice<Buf>, u64) -> Fut,
|
||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<Buf>, std::io::Result<usize>)>,
|
||||
{
|
||||
let mut buf: tokio_epoll_uring::Slice<B> = match count {
|
||||
Some(count) => {
|
||||
assert!(count <= buf.bytes_total());
|
||||
assert!(count > 0);
|
||||
buf.slice(..count) // may include uninitialized memory
|
||||
}
|
||||
None => buf.slice_full(), // includes all the uninitialized memory
|
||||
};
|
||||
|
||||
while buf.bytes_total() != 0 {
|
||||
let res;
|
||||
(buf, res) = read_at(buf, offset).await;
|
||||
@@ -882,7 +872,7 @@ mod test_read_exact_at_impl {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic() {
|
||||
let buf = Vec::with_capacity(5);
|
||||
let buf = Vec::with_capacity(5).slice_full();
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![Expectation {
|
||||
offset: 0,
|
||||
@@ -890,7 +880,7 @@ mod test_read_exact_at_impl {
|
||||
result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
|
||||
}]),
|
||||
}));
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -899,33 +889,13 @@ mod test_read_exact_at_impl {
|
||||
assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_with_count() {
|
||||
let buf = Vec::with_capacity(5);
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![Expectation {
|
||||
offset: 0,
|
||||
bytes_total: 3,
|
||||
result: Ok(vec![b'a', b'b', b'c']),
|
||||
}]),
|
||||
}));
|
||||
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
.await;
|
||||
assert!(res.is_ok());
|
||||
assert_eq!(buf, vec![b'a', b'b', b'c']);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_buf_issues_no_syscall() {
|
||||
let buf = Vec::new();
|
||||
let buf = Vec::new().slice_full();
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::new(),
|
||||
}));
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -935,7 +905,7 @@ mod test_read_exact_at_impl {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_two_read_at_calls_needed_until_buf_filled() {
|
||||
let buf = Vec::with_capacity(4);
|
||||
let buf = Vec::with_capacity(4).slice_full();
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![
|
||||
Expectation {
|
||||
@@ -950,7 +920,7 @@ mod test_read_exact_at_impl {
|
||||
},
|
||||
]),
|
||||
}));
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -961,7 +931,7 @@ mod test_read_exact_at_impl {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_eof_before_buffer_full() {
|
||||
let buf = Vec::with_capacity(3);
|
||||
let buf = Vec::with_capacity(3).slice_full();
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![
|
||||
Expectation {
|
||||
@@ -981,7 +951,7 @@ mod test_read_exact_at_impl {
|
||||
},
|
||||
]),
|
||||
}));
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -1051,27 +1021,29 @@ impl VirtualFile {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
let buf = vec![0; PAGE_SZ];
|
||||
let buf = self
|
||||
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx)
|
||||
let slice = Vec::with_capacity(PAGE_SZ).slice_full();
|
||||
assert_eq!(slice.bytes_total(), PAGE_SZ);
|
||||
let slice = self
|
||||
.read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx)
|
||||
.await?;
|
||||
Ok(crate::tenant::block_io::BlockLease::Vec(buf))
|
||||
Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner()))
|
||||
}
|
||||
|
||||
async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
|
||||
let mut tmp = vec![0; 128];
|
||||
loop {
|
||||
let res;
|
||||
(tmp, res) = self.read_at(tmp, self.pos, ctx).await;
|
||||
let slice = tmp.slice(..128);
|
||||
let (slice, res) = self.read_at(slice, self.pos, ctx).await;
|
||||
match res {
|
||||
Ok(0) => return Ok(()),
|
||||
Ok(n) => {
|
||||
self.pos += n as u64;
|
||||
buf.extend_from_slice(&tmp[..n]);
|
||||
buf.extend_from_slice(&slice[..n]);
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
tmp = slice.into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1185,6 +1157,7 @@ mod tests {
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
use super::*;
|
||||
use owned_buffers_io::slice::SliceExt;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
@@ -1206,13 +1179,16 @@ mod tests {
|
||||
impl MaybeVirtualFile {
|
||||
async fn read_exact_at(
|
||||
&self,
|
||||
mut buf: Vec<u8>,
|
||||
mut slice: tokio_epoll_uring::Slice<Vec<u8>>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<u8>, Error> {
|
||||
) -> Result<tokio_epoll_uring::Slice<Vec<u8>>, Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await,
|
||||
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
|
||||
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed();
|
||||
file.read_exact_at(rust_slice, offset).map(|()| slice)
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
@@ -1286,9 +1262,12 @@ mod tests {
|
||||
len: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<String, Error> {
|
||||
let buf = vec![0; len];
|
||||
let buf = self.read_exact_at(buf, pos, ctx).await?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
let slice = Vec::with_capacity(len).slice_full();
|
||||
assert_eq!(slice.bytes_total(), len);
|
||||
let slice = self.read_exact_at(slice, pos, ctx).await?;
|
||||
let vec = slice.into_inner();
|
||||
assert_eq!(vec.len(), len);
|
||||
Ok(String::from_utf8(vec).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1507,7 +1486,11 @@ mod tests {
|
||||
let mut rng = rand::rngs::OsRng;
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
buf = f.read_exact_at(buf, 0, &ctx).await.unwrap();
|
||||
buf = f
|
||||
.read_exact_at(buf.slice_full(), 0, &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -107,7 +107,7 @@ use std::{
|
||||
sync::atomic::{AtomicU8, Ordering},
|
||||
};
|
||||
|
||||
use super::{FileGuard, Metadata};
|
||||
use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata};
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
|
||||
@@ -120,38 +120,29 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std:
|
||||
}
|
||||
|
||||
impl IoEngine {
|
||||
pub(super) async fn read_at<B>(
|
||||
pub(super) async fn read_at<Buf>(
|
||||
&self,
|
||||
file_guard: FileGuard,
|
||||
offset: u64,
|
||||
mut buf: B,
|
||||
) -> ((FileGuard, B), std::io::Result<usize>)
|
||||
mut slice: tokio_epoll_uring::Slice<Buf>,
|
||||
) -> (
|
||||
(FileGuard, tokio_epoll_uring::Slice<Buf>),
|
||||
std::io::Result<usize>,
|
||||
)
|
||||
where
|
||||
B: tokio_epoll_uring::BoundedBufMut + Send,
|
||||
Buf: tokio_epoll_uring::IoBufMut + Send,
|
||||
{
|
||||
match self {
|
||||
IoEngine::NotSet => panic!("not initialized"),
|
||||
IoEngine::StdFs => {
|
||||
// SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
|
||||
let dst = unsafe {
|
||||
std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
|
||||
};
|
||||
let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
|
||||
if let Ok(nbytes) = &res {
|
||||
assert!(*nbytes <= buf.bytes_total());
|
||||
// SAFETY: see above assertion
|
||||
unsafe {
|
||||
buf.set_init(*nbytes);
|
||||
}
|
||||
}
|
||||
#[allow(dropping_references)]
|
||||
drop(dst);
|
||||
((file_guard, buf), res)
|
||||
let rust_slice = slice.as_mut_rust_slice_full_zeroed();
|
||||
let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset));
|
||||
((file_guard, slice), res)
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.read(file_guard, offset, buf).await;
|
||||
let (resources, res) = system.read(file_guard, offset, slice).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
}
|
||||
|
||||
121
pageserver/src/virtual_file/owned_buffers_io/slice.rs
Normal file
121
pageserver/src/virtual_file/owned_buffers_io/slice.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tokio_epoll_uring::BoundedBufMut;
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::Slice;
|
||||
|
||||
pub(crate) trait SliceExt {
|
||||
/// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO.
|
||||
///
|
||||
/// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]`
|
||||
fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8];
|
||||
}
|
||||
|
||||
impl<B> SliceExt for Slice<B>
|
||||
where
|
||||
B: IoBufMut,
|
||||
{
|
||||
#[inline(always)]
|
||||
fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] {
|
||||
// zero-initialize the uninitialized parts of the buffer so we can create a Rust slice
|
||||
//
|
||||
// SAFETY: we own `slice`, don't write outside the bounds
|
||||
unsafe {
|
||||
let to_init = self.bytes_total() - self.bytes_init();
|
||||
self.stable_mut_ptr()
|
||||
.add(self.bytes_init())
|
||||
.write_bytes(0, to_init);
|
||||
self.set_init(self.bytes_total());
|
||||
};
|
||||
let bytes_total = self.bytes_total();
|
||||
&mut self[0..bytes_total]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::Read;
|
||||
|
||||
use super::*;
|
||||
use bytes::Buf;
|
||||
use tokio_epoll_uring::Slice;
|
||||
|
||||
#[test]
|
||||
fn test_slice_full_zeroed() {
|
||||
let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader();
|
||||
|
||||
// before we start the test, let's make sure we have a shared understanding of what slice_full does
|
||||
{
|
||||
let buf = Vec::with_capacity(3);
|
||||
let slice: Slice<_> = buf.slice_full();
|
||||
assert_eq!(slice.bytes_init(), 0);
|
||||
assert_eq!(slice.bytes_total(), 3);
|
||||
let rust_slice = &slice[..];
|
||||
assert_eq!(
|
||||
rust_slice.len(),
|
||||
0,
|
||||
"Slice only derefs to a &[u8] of the initialized part"
|
||||
);
|
||||
}
|
||||
|
||||
// and also let's establish a shared understanding of .slice()
|
||||
{
|
||||
let buf = Vec::with_capacity(3);
|
||||
let slice: Slice<_> = buf.slice(0..2);
|
||||
assert_eq!(slice.bytes_init(), 0);
|
||||
assert_eq!(slice.bytes_total(), 2);
|
||||
let rust_slice = &slice[..];
|
||||
assert_eq!(
|
||||
rust_slice.len(),
|
||||
0,
|
||||
"Slice only derefs to a &[u8] of the initialized part"
|
||||
);
|
||||
}
|
||||
|
||||
// the above leads to the easy mistake of using slice[..] for borrow-based IO like so:
|
||||
{
|
||||
let buf = Vec::with_capacity(3);
|
||||
let mut slice: Slice<_> = buf.slice_full();
|
||||
assert_eq!(slice[..].len(), 0);
|
||||
let mut file = make_fake_file();
|
||||
file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0
|
||||
assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]);
|
||||
}
|
||||
|
||||
// With owned buffers IO like with VirtualFilem, you could totally
|
||||
// pass in a `Slice` with bytes_init()=0 but bytes_total()=5
|
||||
// and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5.
|
||||
{
|
||||
// TODO: demo
|
||||
}
|
||||
|
||||
//
|
||||
// Ok, now that we have a shared understanding let's demo how to use the extension trait.
|
||||
//
|
||||
|
||||
// slice_full()
|
||||
{
|
||||
let buf = Vec::with_capacity(3);
|
||||
let mut slice: Slice<_> = buf.slice_full();
|
||||
let rust_slice = slice.as_mut_rust_slice_full_zeroed();
|
||||
assert_eq!(rust_slice.len(), 3);
|
||||
assert_eq!(rust_slice, &[0, 0, 0]);
|
||||
let mut file = make_fake_file();
|
||||
file.read_exact(rust_slice).unwrap();
|
||||
assert_eq!(rust_slice, b"123");
|
||||
assert_eq!(&slice[..], b"123");
|
||||
}
|
||||
|
||||
// .slice(..)
|
||||
{
|
||||
let buf = Vec::with_capacity(3);
|
||||
let mut slice: Slice<_> = buf.slice(0..2);
|
||||
let rust_slice = slice.as_mut_rust_slice_full_zeroed();
|
||||
assert_eq!(rust_slice.len(), 2);
|
||||
assert_eq!(rust_slice, &[0, 0]);
|
||||
let mut file = make_fake_file();
|
||||
file.read_exact(rust_slice).unwrap();
|
||||
assert_eq!(rust_slice, b"12");
|
||||
assert_eq!(&slice[..], b"12");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -343,7 +343,33 @@ impl WalIngest {
|
||||
xlog_checkpoint.oldestActiveXid,
|
||||
self.checkpoint.oldestActiveXid
|
||||
);
|
||||
self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
|
||||
|
||||
// A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
|
||||
// because at shutdown, all in-progress transactions will implicitly
|
||||
// end. Postgres startup code knows that, and allows hot standby to start
|
||||
// immediately from a shutdown checkpoint.
|
||||
//
|
||||
// In Neon, Postgres hot standby startup always behaves as if starting from
|
||||
// an online checkpoint. It needs a valid `oldestActiveXid` value, so
|
||||
// instead of overwriting self.checkpoint.oldestActiveXid with
|
||||
// InvalidTransactionid from the checkpoint WAL record, update it to a
|
||||
// proper value, knowing that there are no in-progress transactions at this
|
||||
// point, except for prepared transactions.
|
||||
//
|
||||
// See also the neon code changes in the InitWalRecovery() function.
|
||||
if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
|
||||
&& info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
|
||||
for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
|
||||
if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
|
||||
oldest_active_xid = xid;
|
||||
}
|
||||
}
|
||||
self.checkpoint.oldestActiveXid = oldest_active_xid;
|
||||
} else {
|
||||
self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
|
||||
}
|
||||
|
||||
// Write a new checkpoint key-value pair on every checkpoint record, even
|
||||
// if nothing really changed. Not strictly required, but it seems nice to
|
||||
@@ -375,6 +401,7 @@ impl WalIngest {
|
||||
if info == pg_constants::XLOG_RUNNING_XACTS {
|
||||
let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
|
||||
self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
pg_constants::RM_REPLORIGIN_ID => {
|
||||
@@ -1277,13 +1304,10 @@ impl WalIngest {
|
||||
xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
|
||||
);
|
||||
|
||||
// Here we treat oldestXid and oldestXidDB
|
||||
// differently from postgres redo routines.
|
||||
// In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid
|
||||
// until checkpoint happens and updates the value.
|
||||
// Here we can use the most recent value.
|
||||
// It's just an optimization, though and can be deleted.
|
||||
// TODO Figure out if there will be any issues with replica.
|
||||
// In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is
|
||||
// truncated, but a checkpoint record with the updated values isn't written until
|
||||
// later. In Neon, a server can start at any LSN, not just on a checkpoint record,
|
||||
// so we keep the oldestXid and oldestXidDB up-to-date.
|
||||
self.checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
self.checkpoint_modified = true;
|
||||
@@ -1384,14 +1408,31 @@ impl WalIngest {
|
||||
// Note: The multixact members can wrap around, even within one WAL record.
|
||||
offset = offset.wrapping_add(n_this_page as u32);
|
||||
}
|
||||
if xlrec.mid >= self.checkpoint.nextMulti {
|
||||
self.checkpoint.nextMulti = xlrec.mid + 1;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset {
|
||||
self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
let next_offset = offset;
|
||||
assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset);
|
||||
|
||||
// Update next-multi-xid and next-offset
|
||||
//
|
||||
// NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to
|
||||
// go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that
|
||||
// read it, like GetNewMultiXactId(). This is different from how nextXid is
|
||||
// incremented! nextXid skips over < FirstNormalTransactionId when the the value
|
||||
// is stored, so it's never 0 in a checkpoint.
|
||||
//
|
||||
// I don't know why it's done that way, it seems less error-prone to skip over 0
|
||||
// when the value is stored rather than when it's read. But let's do it the same
|
||||
// way here.
|
||||
let next_multi_xid = xlrec.mid.wrapping_add(1);
|
||||
|
||||
if self
|
||||
.checkpoint
|
||||
.update_next_multixid(next_multi_xid, next_offset)
|
||||
{
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
|
||||
// Also update the next-xid with the highest member. According to the comments in
|
||||
// multixact_redo(), this shouldn't be necessary, but let's do the same here.
|
||||
let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
|
||||
if let Some(max_xid) = acc {
|
||||
if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
|
||||
|
||||
@@ -40,6 +40,7 @@ use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
///
|
||||
@@ -53,10 +54,18 @@ pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
|
||||
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// We use [`heavier_once_cell`] for
|
||||
///
|
||||
/// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`])
|
||||
/// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]).
|
||||
///
|
||||
/// # Spawning
|
||||
///
|
||||
/// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`].
|
||||
///
|
||||
/// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// their process object; we use [`Arc::clone`] for that.
|
||||
///
|
||||
/// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
|
||||
/// had that behavior; it's probably unnecessary.
|
||||
/// The only merit of it is that if one walredo process encounters an error,
|
||||
@@ -65,7 +74,63 @@ pub struct PostgresRedoManager {
|
||||
/// still be using the old redo process. But, those other tasks will most likely
|
||||
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||
/// So, probably we could get rid of the `Arc` in the future.
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
|
||||
///
|
||||
/// # Shutdown
|
||||
///
|
||||
/// See [`Self::launched_processes`].
|
||||
redo_process: heavier_once_cell::OnceCell<ProcessOnceCell>,
|
||||
|
||||
/// Gate that is entered when launching a walredo process and held open
|
||||
/// until the process has been `kill()`ed and `wait()`ed upon.
|
||||
///
|
||||
/// Manager shutdown waits for this gate to close after setting the
|
||||
/// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`].
|
||||
///
|
||||
/// This type of usage is a bit unusual because gates usually keep track of
|
||||
/// concurrent operations, e.g., every [`Self::request_redo`] that is inflight.
|
||||
/// But we use it here to keep track of the _processes_ that we have launched,
|
||||
/// which may outlive any individual redo request because
|
||||
/// - we keep walredo process around until its quiesced to amortize spawn cost and
|
||||
/// - the Arc may be held by multiple concurrent redo requests, so, just because
|
||||
/// you replace the [`Self::redo_process`] cell's content doesn't mean the
|
||||
/// process gets killed immediately.
|
||||
///
|
||||
/// We could simplify this by getting rid of the [`Arc`].
|
||||
/// See the comment on [`Self::redo_process`] for more details.
|
||||
launched_processes: utils::sync::gate::Gate,
|
||||
}
|
||||
|
||||
/// See [`PostgresRedoManager::redo_process`].
|
||||
enum ProcessOnceCell {
|
||||
Spawned(Arc<Process>),
|
||||
ManagerShutDown,
|
||||
}
|
||||
|
||||
struct Process {
|
||||
_launched_processes_guard: utils::sync::gate::GateGuard,
|
||||
process: process::WalRedoProcess,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for Process {
|
||||
type Target = process::WalRedoProcess;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.process
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum Error {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
macro_rules! bail {
|
||||
($($arg:tt)*) => {
|
||||
return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*)));
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -88,9 +153,9 @@ impl PostgresRedoManager {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, Error> {
|
||||
if records.is_empty() {
|
||||
anyhow::bail!("invalid WAL redo request with no records");
|
||||
bail!("invalid WAL redo request with no records");
|
||||
}
|
||||
|
||||
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
|
||||
@@ -148,10 +213,10 @@ impl PostgresRedoManager {
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
process: self
|
||||
.redo_process
|
||||
.get()
|
||||
.map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
|
||||
process: self.redo_process.get().and_then(|p| match &*p {
|
||||
ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }),
|
||||
ProcessOnceCell::ManagerShutDown => None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -170,9 +235,39 @@ impl PostgresRedoManager {
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: heavier_once_cell::OnceCell::default(),
|
||||
launched_processes: utils::sync::gate::Gate::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shut down the WAL redo manager.
|
||||
///
|
||||
/// After this future completes
|
||||
/// - no redo process is running
|
||||
/// - no new redo process will be spawned
|
||||
/// - redo requests that need walredo process will fail with [`Error::Cancelled`]
|
||||
/// - [`apply_neon`]-only redo requests may still work, but this may change in the future
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn shutdown(&self) {
|
||||
// prevent new processes from being spawned
|
||||
let permit = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
permit
|
||||
}
|
||||
Err(permit) => permit,
|
||||
};
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
|
||||
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
|
||||
// for the underlying process.
|
||||
self.launched_processes.close().await;
|
||||
}
|
||||
|
||||
/// This type doesn't have its own background task to check for idleness: we
|
||||
/// rely on our owner calling this function periodically in its own housekeeping
|
||||
/// loops.
|
||||
@@ -203,38 +298,48 @@ impl PostgresRedoManager {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, Error> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<process::WalRedoProcess> =
|
||||
match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::WalRedoProcess::launch(
|
||||
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => match &*guard {
|
||||
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
|
||||
ProcessOnceCell::ManagerShutDown => {
|
||||
return Err(Error::Cancelled);
|
||||
}
|
||||
},
|
||||
Err(permit) => {
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(Process {
|
||||
_launched_processes_guard: match self.launched_processes.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(GateError::GateClosed) => unreachable!(
|
||||
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
|
||||
),
|
||||
},
|
||||
process: process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
});
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
@@ -299,12 +404,17 @@ impl PostgresRedoManager {
|
||||
match self.redo_process.get() {
|
||||
None => (),
|
||||
Some(guard) => {
|
||||
if Arc::ptr_eq(&proc, &*guard) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
match &*guard {
|
||||
ProcessOnceCell::ManagerShutDown => {}
|
||||
ProcessOnceCell::Spawned(guard_proc) => {
|
||||
if Arc::ptr_eq(&proc, guard_proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -315,7 +425,7 @@ impl PostgresRedoManager {
|
||||
}
|
||||
n_attempts += 1;
|
||||
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
|
||||
return result;
|
||||
return result.map_err(Error::Other);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -329,7 +439,7 @@ impl PostgresRedoManager {
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, Error> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut page = BytesMut::new();
|
||||
@@ -338,7 +448,7 @@ impl PostgresRedoManager {
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// All the current WAL record types that we can handle require a base image.
|
||||
anyhow::bail!("invalid neon WAL redo request with no base image");
|
||||
bail!("invalid neon WAL redo request with no base image");
|
||||
}
|
||||
|
||||
// Apply all the WAL records in the batch
|
||||
|
||||
303
pgxn/neon/neon.c
303
pgxn/neon/neon.c
@@ -12,6 +12,8 @@
|
||||
#include "fmgr.h"
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "access/subtrans.h"
|
||||
#include "access/twophase.h"
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "storage/buf_internals.h"
|
||||
@@ -22,10 +24,12 @@
|
||||
#include "replication/logical.h"
|
||||
#include "replication/slot.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/proc.h"
|
||||
#include "storage/procsignal.h"
|
||||
#include "tcop/tcopprot.h"
|
||||
#include "funcapi.h"
|
||||
#include "access/htup_details.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/pg_lsn.h"
|
||||
#include "utils/guc.h"
|
||||
#include "utils/wait_event.h"
|
||||
@@ -41,7 +45,6 @@ PG_MODULE_MAGIC;
|
||||
void _PG_init(void);
|
||||
|
||||
static int logical_replication_max_snap_files = 300;
|
||||
bool primary_is_running = false;
|
||||
|
||||
static void
|
||||
InitLogicalReplicationMonitor(void)
|
||||
@@ -267,6 +270,293 @@ LogicalSlotsMonitorMain(Datum main_arg)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* XXX: These private to procarray.c, but we need them here.
|
||||
*/
|
||||
#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
|
||||
#define TOTAL_MAX_CACHED_SUBXIDS \
|
||||
((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
|
||||
|
||||
/*
|
||||
* Restore running-xact information by scanning the CLOG at startup.
|
||||
*
|
||||
* In PostgreSQL, a standby always has to wait for a running-xacts WAL record
|
||||
* to arrive before it can start accepting queries. Furthermore, if there are
|
||||
* transactions with too many subxids (> 64) open to fit in the in-memory
|
||||
* subxids cache, the running-xacts record will be marked as "suboverflowed",
|
||||
* and the standby will need to also wait for the currently in-progress
|
||||
* transactions to finish.
|
||||
*
|
||||
* That's not great in PostgreSQL, because a hot standby does not necessary
|
||||
* open up for queries immediately as you might expect. But it's worse in
|
||||
* Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint
|
||||
* record; it can start at any LSN. Postgres arranges things so that there is
|
||||
* a running-xacts record soon after every checkpoint record, but when you
|
||||
* start from an arbitrary LSN, that doesn't help. If the primary is idle, or
|
||||
* not running at all, it might never write a new running-xacts record,
|
||||
* leaving the replica in a limbo where it can never start accepting queries.
|
||||
*
|
||||
* To mitigate that, we have an additional mechanism to find the running-xacts
|
||||
* information: we scan the CLOG, making note of any XIDs not marked as
|
||||
* committed or aborted. They are added to the Postgres known-assigned XIDs
|
||||
* array by calling ProcArrayApplyRecoveryInfo() in the caller of this
|
||||
* function.
|
||||
*
|
||||
* There is one big limitation with that mechanism: The size of the
|
||||
* known-assigned XIDs is limited, so if there are a lot of in-progress XIDs,
|
||||
* we have to give up. Furthermore, we don't know how many of the in-progress
|
||||
* XIDs are subtransactions, and if we use up all the space in the
|
||||
* known-assigned XIDs array for subtransactions, we might run out of space in
|
||||
* the array later during WAL replay, causing the replica to shut down with
|
||||
* "ERROR: too many KnownAssignedXids". The safe # of XIDs that we can add to
|
||||
* the known-assigned array without risking that error later is very low,
|
||||
* merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up
|
||||
* to half of the known-assigned XIDs array for the subtransactions, even
|
||||
* though that risks getting the error later.
|
||||
*
|
||||
* Note: It's OK if the recovered list of XIDs includes some transactions that
|
||||
* have crashed in the primary, and hence will never commit. They will be seen
|
||||
* as in-progress, until we see a new next running-acts record with an
|
||||
* oldestActiveXid that invalidates them. That's how the known-assigned XIDs
|
||||
* array always works.
|
||||
*
|
||||
* If scraping the CLOG doesn't succeed for some reason, like the subxid
|
||||
* overflow, Postgres will fall back to waiting for a running-xacts record
|
||||
* like usual.
|
||||
*
|
||||
* Returns true if a complete list of in-progress XIDs was scraped.
|
||||
*/
|
||||
static bool
|
||||
RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids)
|
||||
{
|
||||
TransactionId from;
|
||||
TransactionId till;
|
||||
int max_xcnt;
|
||||
TransactionId *prepared_xids = NULL;
|
||||
int n_prepared_xids;
|
||||
TransactionId *restored_xids = NULL;
|
||||
int n_restored_xids;
|
||||
int next_prepared_idx;
|
||||
|
||||
Assert(*xids == NULL);
|
||||
|
||||
/*
|
||||
* If the checkpoint doesn't have a valid oldestActiveXid, bail out. We
|
||||
* don't know where to start the scan.
|
||||
*
|
||||
* This shouldn't happen, because the pageserver always maintains a valid
|
||||
* oldestActiveXid nowadays. Except when starting at an old point in time
|
||||
* that was ingested before the pageserver was taught to do that.
|
||||
*/
|
||||
if (!TransactionIdIsValid(checkpoint->oldestActiveXid))
|
||||
{
|
||||
elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/*
|
||||
* We will scan the CLOG starting from the oldest active XID.
|
||||
*
|
||||
* In some corner cases, the oldestActiveXid from the last checkpoint
|
||||
* might already have been truncated from the CLOG. That is,
|
||||
* oldestActiveXid might be older than oldestXid. That's possible because
|
||||
* oldestActiveXid is only updated at checkpoints. After the last
|
||||
* checkpoint, the oldest transaction might have committed, and the CLOG
|
||||
* might also have been already truncated. So if oldestActiveXid is older
|
||||
* than oldestXid, start at oldestXid instead. (Otherwise we'd try to
|
||||
* access CLOG segments that have already been truncated away.)
|
||||
*/
|
||||
from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid)
|
||||
? checkpoint->oldestActiveXid : checkpoint->oldestXid;
|
||||
till = XidFromFullTransactionId(checkpoint->nextXid);
|
||||
|
||||
/*
|
||||
* To avoid "too many KnownAssignedXids" error later during replay, we
|
||||
* limit number of collected transactions. This is a tradeoff: if we are
|
||||
* willing to consume more of the KnownAssignedXids space for the XIDs
|
||||
* now, that allows us to start up, but we might run out of space later.
|
||||
*
|
||||
* The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS,
|
||||
* which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In
|
||||
* PostgreSQL, that's always enough because the primary will always write
|
||||
* an XLOG_XACT_ASSIGNMENT record if a transaction has more than
|
||||
* PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows
|
||||
* the standby to mark the XIDs in pg_subtrans and removing them from the
|
||||
* KnowingAssignedXids array.
|
||||
*
|
||||
* Here, we don't know which XIDs belong to subtransactions that have
|
||||
* already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. If we
|
||||
* wanted to be totally safe and avoid the possibility of getting a "too
|
||||
* many KnownAssignedXids" error later, we would have to limit ourselves
|
||||
* to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top
|
||||
* transaction IDs too, because we cannot distinguish between top
|
||||
* transaction IDs and subtransactions here.
|
||||
*
|
||||
* Somewhat arbitrarily, we use up to half of KnownAssignedXids. That
|
||||
* strikes a sensible balance between being useful, and risking a "too
|
||||
* many KnownAssignedXids" error later.
|
||||
*/
|
||||
max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2;
|
||||
|
||||
/*
|
||||
* Collect XIDs of prepared transactions in an array. This includes only
|
||||
* their top-level XIDs. We assume that StandbyRecoverPreparedTransactions
|
||||
* has already been called, so we can find all the sub-transactions in
|
||||
* pg_subtrans.
|
||||
*/
|
||||
PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids);
|
||||
qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator);
|
||||
|
||||
/*
|
||||
* Scan the CLOG, collecting in-progress XIDs into 'restored_xids'.
|
||||
*/
|
||||
elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till);
|
||||
restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId));
|
||||
n_restored_xids = 0;
|
||||
next_prepared_idx = 0;
|
||||
for (TransactionId xid = from; xid != till;)
|
||||
{
|
||||
XLogRecPtr xidlsn;
|
||||
XidStatus xidstatus;
|
||||
|
||||
xidstatus = TransactionIdGetStatus(xid, &xidlsn);
|
||||
|
||||
/*
|
||||
* "Merge" the prepared transactions into the restored_xids array as
|
||||
* we go. The prepared transactions array is sorted. This is mostly
|
||||
* a sanity check to ensure that all the prpeared transactions are
|
||||
* seen as in-progress. (There is a check after the loop that we didn't
|
||||
* miss any.)
|
||||
*/
|
||||
if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx])
|
||||
{
|
||||
/*
|
||||
* This is a top-level transaction ID of a prepared transaction.
|
||||
* Include it in the array.
|
||||
*/
|
||||
|
||||
/* sanity check */
|
||||
if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS)
|
||||
{
|
||||
elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG",
|
||||
xid, xidstatus);
|
||||
Assert(false);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids);
|
||||
next_prepared_idx++;
|
||||
}
|
||||
else if (xidstatus == TRANSACTION_STATUS_COMMITTED)
|
||||
{
|
||||
elog(DEBUG1, "XID %u: was committed", xid);
|
||||
goto skip;
|
||||
}
|
||||
else if (xidstatus == TRANSACTION_STATUS_ABORTED)
|
||||
{
|
||||
elog(DEBUG1, "XID %u: was aborted", xid);
|
||||
goto skip;
|
||||
}
|
||||
else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS)
|
||||
{
|
||||
/*
|
||||
* In-progress transactions are included in the array.
|
||||
*
|
||||
* Except subtransactions of the prepared transactions. They are
|
||||
* already set in pg_subtrans, and hence don't need to be tracked
|
||||
* in the known-assigned XIDs array.
|
||||
*/
|
||||
if (n_prepared_xids > 0)
|
||||
{
|
||||
TransactionId parent = SubTransGetParent(xid);
|
||||
|
||||
if (TransactionIdIsValid(parent))
|
||||
{
|
||||
/*
|
||||
* This is a subtransaction belonging to a prepared
|
||||
* transaction.
|
||||
*
|
||||
* Sanity check that it is in the prepared XIDs array. It
|
||||
* should be, because StandbyRecoverPreparedTransactions
|
||||
* populated pg_subtrans, and no other XID should be set
|
||||
* in it yet. (This also relies on the fact that
|
||||
* StandbyRecoverPreparedTransactions sets the parent of
|
||||
* each subxid to point directly to the top-level XID,
|
||||
* rather than restoring the original subtransaction
|
||||
* hierarchy.)
|
||||
*/
|
||||
if (bsearch(&parent, prepared_xids, next_prepared_idx,
|
||||
sizeof(TransactionId), xidLogicalComparator) == NULL)
|
||||
{
|
||||
elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG",
|
||||
xid, parent);
|
||||
Assert(false);
|
||||
goto fail;
|
||||
}
|
||||
elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent);
|
||||
goto skip;
|
||||
}
|
||||
}
|
||||
|
||||
/* include it in the array */
|
||||
elog(DEBUG1, "XID %u: is in progress", xid);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* SUB_COMMITTED is a transient state used at commit. We don't
|
||||
* expect to see that here.
|
||||
*/
|
||||
elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG",
|
||||
xid, xidstatus);
|
||||
Assert(false);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (n_restored_xids >= max_xcnt)
|
||||
{
|
||||
/*
|
||||
* Overflowed. We won't be able to install the RunningTransactions
|
||||
* snapshot.
|
||||
*/
|
||||
elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
|
||||
checkpoint->oldestXid, checkpoint->oldestActiveXid,
|
||||
XidFromFullTransactionId(checkpoint->nextXid));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
restored_xids[n_restored_xids++] = xid;
|
||||
|
||||
skip:
|
||||
TransactionIdAdvance(xid);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* sanity check */
|
||||
if (next_prepared_idx != n_prepared_xids)
|
||||
{
|
||||
elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG",
|
||||
prepared_xids[next_prepared_idx]);
|
||||
Assert(false);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
|
||||
n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid));
|
||||
*nxids = n_restored_xids;
|
||||
*xids = restored_xids;
|
||||
return true;
|
||||
|
||||
fail:
|
||||
*nxids = 0;
|
||||
*xids = NULL;
|
||||
if (restored_xids)
|
||||
pfree(restored_xids);
|
||||
if (prepared_xids)
|
||||
pfree(prepared_xids);
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
_PG_init(void)
|
||||
{
|
||||
@@ -289,15 +579,8 @@ _PG_init(void)
|
||||
|
||||
pg_init_extension_server();
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"neon.primary_is_running",
|
||||
"true if the primary was running at replica startup. false otherwise",
|
||||
NULL,
|
||||
&primary_is_running,
|
||||
false,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
restore_running_xacts_callback = RestoreRunningXactsFromClog;
|
||||
|
||||
/*
|
||||
* Important: This must happen after other parts of the extension are
|
||||
* loaded, otherwise any settings to GUCs that were set before the
|
||||
|
||||
@@ -1447,7 +1447,7 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
* core as this is kinda expected scenario.
|
||||
*/
|
||||
disable_core_dump();
|
||||
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
}
|
||||
|
||||
@@ -63,6 +63,8 @@ char *wal_acceptors_list = "";
|
||||
int wal_acceptor_reconnect_timeout = 1000;
|
||||
int wal_acceptor_connection_timeout = 10000;
|
||||
|
||||
/* Set to true in the walproposer bgw. */
|
||||
static bool am_walproposer;
|
||||
static WalproposerShmemState *walprop_shared;
|
||||
static WalProposerConfig walprop_config;
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
@@ -76,6 +78,7 @@ static HotStandbyFeedback agg_hs_feedback;
|
||||
|
||||
static void nwp_shmem_startup_hook(void);
|
||||
static void nwp_register_gucs(void);
|
||||
static void assign_neon_safekeepers(const char *newval, void *extra);
|
||||
static void nwp_prepare_shmem(void);
|
||||
static uint64 backpressure_lag_impl(void);
|
||||
static bool backpressure_throttling_impl(void);
|
||||
@@ -111,7 +114,8 @@ init_walprop_config(bool syncSafekeepers)
|
||||
{
|
||||
walprop_config.neon_tenant = neon_tenant;
|
||||
walprop_config.neon_timeline = neon_timeline;
|
||||
walprop_config.safekeepers_list = wal_acceptors_list;
|
||||
/* WalProposerCreate scribbles directly on it, so pstrdup */
|
||||
walprop_config.safekeepers_list = pstrdup(wal_acceptors_list);
|
||||
walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout;
|
||||
walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout;
|
||||
walprop_config.wal_segment_size = wal_segment_size;
|
||||
@@ -151,6 +155,7 @@ WalProposerMain(Datum main_arg)
|
||||
|
||||
init_walprop_config(false);
|
||||
walprop_pg_init_bgworker();
|
||||
am_walproposer = true;
|
||||
walprop_pg_load_libpqwalreceiver();
|
||||
|
||||
wp = WalProposerCreate(&walprop_config, walprop_pg);
|
||||
@@ -189,10 +194,10 @@ nwp_register_gucs(void)
|
||||
NULL, /* long_desc */
|
||||
&wal_acceptors_list, /* valueAddr */
|
||||
"", /* bootValue */
|
||||
PGC_POSTMASTER,
|
||||
PGC_SIGHUP,
|
||||
GUC_LIST_INPUT, /* extensions can't use*
|
||||
* GUC_LIST_QUOTE */
|
||||
NULL, NULL, NULL);
|
||||
NULL, assign_neon_safekeepers, NULL);
|
||||
|
||||
DefineCustomIntVariable(
|
||||
"neon.safekeeper_reconnect_timeout",
|
||||
@@ -215,6 +220,33 @@ nwp_register_gucs(void)
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
|
||||
* the list changed.
|
||||
*/
|
||||
static void
|
||||
assign_neon_safekeepers(const char *newval, void *extra)
|
||||
{
|
||||
if (!am_walproposer)
|
||||
return;
|
||||
|
||||
if (!newval) {
|
||||
/* should never happen */
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
*/
|
||||
if (strcmp(wal_acceptors_list, newval) != 0)
|
||||
{
|
||||
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
|
||||
wal_acceptors_list, newval);
|
||||
}
|
||||
}
|
||||
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
static uint64
|
||||
backpressure_lag_impl(void)
|
||||
@@ -363,7 +395,7 @@ walprop_register_bgworker(void)
|
||||
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain");
|
||||
snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer");
|
||||
snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer");
|
||||
bgw.bgw_restart_time = 5;
|
||||
bgw.bgw_restart_time = 1;
|
||||
bgw.bgw_notify_pid = 0;
|
||||
bgw.bgw_main_arg = (Datum) 0;
|
||||
|
||||
@@ -1639,6 +1671,18 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
|
||||
late_cv_trigger = ConditionVariableCancelSleep();
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Process config if requested. This restarts walproposer if safekeepers
|
||||
* list changed. Don't do that for sync-safekeepers because quite probably
|
||||
* it (re-reading config) won't work without some effort, and
|
||||
* sync-safekeepers should be quick to finish anyway.
|
||||
*/
|
||||
if (!wp->config->syncSafekeepers && ConfigReloadPending)
|
||||
{
|
||||
ConfigReloadPending = false;
|
||||
ProcessConfigFile(PGC_SIGHUP);
|
||||
}
|
||||
|
||||
/*
|
||||
* If wait is terminated by latch set (walsenders' latch is set on each
|
||||
* wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH)
|
||||
|
||||
@@ -7,7 +7,7 @@ OBJS = \
|
||||
neontest.o
|
||||
|
||||
EXTENSION = neon_test_utils
|
||||
DATA = neon_test_utils--1.1.sql
|
||||
DATA = neon_test_utils--1.2.sql
|
||||
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
|
||||
@@ -41,7 +41,7 @@ RETURNS bytea
|
||||
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION neon_xlogflush(lsn pg_lsn)
|
||||
CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'neon_xlogflush'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
@@ -1,6 +1,6 @@
|
||||
# neon_test_utils extension
|
||||
comment = 'helpers for neon testing and debugging'
|
||||
default_version = '1.1'
|
||||
default_version = '1.2'
|
||||
module_pathname = '$libdir/neon_test_utils'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "access/relation.h"
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#include "catalog/namespace.h"
|
||||
#include "fmgr.h"
|
||||
#include "funcapi.h"
|
||||
@@ -444,11 +445,46 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* Directly calls XLogFlush(lsn) to flush WAL buffers.
|
||||
*
|
||||
* If 'lsn' is not specified (is NULL), flush all generated WAL.
|
||||
*/
|
||||
Datum
|
||||
neon_xlogflush(PG_FUNCTION_ARGS)
|
||||
{
|
||||
XLogRecPtr lsn = PG_GETARG_LSN(0);
|
||||
XLogRecPtr lsn;
|
||||
|
||||
if (RecoveryInProgress())
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("recovery is in progress"),
|
||||
errhint("cannot flush WAL during recovery.")));
|
||||
|
||||
if (!PG_ARGISNULL(0))
|
||||
lsn = PG_GETARG_LSN(0);
|
||||
else
|
||||
{
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
|
||||
/*---
|
||||
* The LSN returned by GetXLogInsertRecPtr() is the position where the
|
||||
* next inserted record would begin. If the last record ended just at
|
||||
* the page boundary, the next record will begin after the page header
|
||||
* on the next page, and that's what GetXLogInsertRecPtr().returns,
|
||||
* but the page header has not been written yet. If we tried to flush
|
||||
* it, XLogFlush() would throw an error:
|
||||
*
|
||||
* ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X
|
||||
*
|
||||
* To avoid that, if the insert position points to just after the page
|
||||
* header, back off to page boundary.
|
||||
*/
|
||||
if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
|
||||
XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ)
|
||||
lsn -= SizeOfXLogShortPHD;
|
||||
else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
|
||||
XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ)
|
||||
lsn -= SizeOfXLogLongPHD;
|
||||
}
|
||||
|
||||
XLogFlush(lsn);
|
||||
PG_RETURN_VOID();
|
||||
|
||||
@@ -168,16 +168,15 @@ close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flag
|
||||
static void
|
||||
enter_seccomp_mode(void)
|
||||
{
|
||||
|
||||
/*
|
||||
* The pageserver process relies on us to close all the file descriptors
|
||||
* it potentially leaked to us, _before_ we start processing potentially dangerous
|
||||
* wal records. See the comment in the Rust code that launches this process.
|
||||
*/
|
||||
int err;
|
||||
if (err = close_range_syscall(3, ~0U, 0)) {
|
||||
ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
|
||||
}
|
||||
if (close_range_syscall(3, ~0U, 0) != 0)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_SYSTEM_ERROR),
|
||||
errmsg("seccomp: could not close files >= fd 3")));
|
||||
|
||||
PgSeccompRule syscalls[] =
|
||||
{
|
||||
|
||||
@@ -10,7 +10,7 @@ use itertools::Itertools;
|
||||
use proxy::config::TlsServerEndPoint;
|
||||
use proxy::context::RequestMonitoring;
|
||||
use proxy::metrics::{Metrics, ThreadPoolMetrics};
|
||||
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
|
||||
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
|
||||
use rustls::pki_types::PrivateKeyDer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
@@ -286,7 +286,10 @@ async fn handle_client(
|
||||
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
info!("performing the proxy pass...");
|
||||
let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?;
|
||||
|
||||
Ok(())
|
||||
match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await {
|
||||
Ok(_) => Ok(()),
|
||||
Err(ErrorSource::Client(err)) => Err(err).context("client"),
|
||||
Err(ErrorSource::Compute(err)) => Err(err).context("compute"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,12 +103,8 @@ impl ConnCfg {
|
||||
|
||||
/// Reuse password or auth keys from the other config.
|
||||
pub fn reuse_password(&mut self, other: Self) {
|
||||
if let Some(password) = other.get_password() {
|
||||
self.password(password);
|
||||
}
|
||||
|
||||
if let Some(keys) = other.get_auth_keys() {
|
||||
self.auth_keys(keys);
|
||||
if let Some(password) = other.get_auth() {
|
||||
self.auth(password);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,48 +120,64 @@ impl ConnCfg {
|
||||
|
||||
/// Apply startup message params to the connection config.
|
||||
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
|
||||
self.user(user);
|
||||
}
|
||||
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
|
||||
self.dbname(dbname);
|
||||
}
|
||||
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(params) {
|
||||
self.options(&options);
|
||||
}
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.application_name(app_name);
|
||||
}
|
||||
|
||||
// TODO: This is especially ugly...
|
||||
if let Some(replication) = params.get("replication") {
|
||||
use tokio_postgres::config::ReplicationMode;
|
||||
match replication {
|
||||
"true" | "on" | "yes" | "1" => {
|
||||
self.replication_mode(ReplicationMode::Physical);
|
||||
let mut client_encoding = false;
|
||||
for (k, v) in params.iter() {
|
||||
match k {
|
||||
"user" => {
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if self.get_user().is_none() {
|
||||
self.user(v);
|
||||
}
|
||||
}
|
||||
"database" => {
|
||||
self.replication_mode(ReplicationMode::Logical);
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if self.get_dbname().is_none() {
|
||||
self.dbname(v);
|
||||
}
|
||||
}
|
||||
"options" => {
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(v) {
|
||||
self.options(&options);
|
||||
}
|
||||
}
|
||||
|
||||
// the special ones in tokio-postgres that we don't want being set by the user
|
||||
"dbname" => {}
|
||||
"password" => {}
|
||||
"sslmode" => {}
|
||||
"host" => {}
|
||||
"port" => {}
|
||||
"connect_timeout" => {}
|
||||
"keepalives" => {}
|
||||
"keepalives_idle" => {}
|
||||
"keepalives_interval" => {}
|
||||
"keepalives_retries" => {}
|
||||
"target_session_attrs" => {}
|
||||
"channel_binding" => {}
|
||||
"max_backend_message_size" => {}
|
||||
|
||||
"client_encoding" => {
|
||||
client_encoding = true;
|
||||
// only error should be from bad null bytes,
|
||||
// but we've already checked for those.
|
||||
_ = self.param("client_encoding", v);
|
||||
}
|
||||
|
||||
_ => {
|
||||
// only error should be from bad null bytes,
|
||||
// but we've already checked for those.
|
||||
_ = self.param(k, v);
|
||||
}
|
||||
_other => {}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: extend the list of the forwarded startup parameters.
|
||||
// Currently, tokio-postgres doesn't allow us to pass
|
||||
// arbitrary parameters, but the ones above are a good start.
|
||||
//
|
||||
// This and the reverse params problem can be better addressed
|
||||
// in a bespoke connection machinery (a new library for that sake).
|
||||
if !client_encoding {
|
||||
// for compatibility since we removed it from tokio-postgres
|
||||
self.param("client_encoding", "UTF8").unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -338,10 +350,9 @@ impl ConnCfg {
|
||||
}
|
||||
|
||||
/// Retrieve `options` from a startup message, dropping all proxy-secific flags.
|
||||
fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
||||
fn filtered_options(options: &str) -> Option<String> {
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = params
|
||||
.options_raw()?
|
||||
let options: String = StartupMessageParams::parse_options_raw(options)
|
||||
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
@@ -413,27 +424,23 @@ mod tests {
|
||||
#[test]
|
||||
fn test_filtered_options() {
|
||||
// Empty options is unlikely to be useful anyway.
|
||||
let params = StartupMessageParams::new([("options", "")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
assert_eq!(filtered_options(""), None);
|
||||
|
||||
// It's likely that clients will only use options to specify endpoint/project.
|
||||
let params = StartupMessageParams::new([("options", "project=foo")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
let params = "project=foo";
|
||||
assert_eq!(filtered_options(params), None);
|
||||
|
||||
// Same, because unescaped whitespaces are no-op.
|
||||
let params = StartupMessageParams::new([("options", " project=foo ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), None);
|
||||
let params = " project=foo ";
|
||||
assert_eq!(filtered_options(params), None);
|
||||
|
||||
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ "));
|
||||
let params = r"\ project=foo \ ";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ "));
|
||||
|
||||
let params = StartupMessageParams::new([("options", "project = foo")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
let params = "project = foo";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
|
||||
|
||||
let params = StartupMessageParams::new([(
|
||||
"options",
|
||||
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
|
||||
)]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::fmt::{self, Display};
|
||||
use crate::auth::IpPattern;
|
||||
|
||||
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
|
||||
use crate::proxy::retry::ShouldRetry;
|
||||
use crate::proxy::retry::CouldRetry;
|
||||
|
||||
/// Generic error response with human-readable description.
|
||||
/// Note that we can't always present it to user as is.
|
||||
@@ -64,45 +64,47 @@ impl Display for ConsoleError {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for ConsoleError {
|
||||
impl CouldRetry for ConsoleError {
|
||||
fn could_retry(&self) -> bool {
|
||||
if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() {
|
||||
// retry some temporary failures because the compute was in a bad state
|
||||
// (bad request can be returned when the endpoint was in transition)
|
||||
return match &self {
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => true,
|
||||
// don't retry when quotas are exceeded
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref error,
|
||||
..
|
||||
} => !error.contains("compute time quota of non-primary branches is exceeded"),
|
||||
// locked can be returned when the endpoint was in transition
|
||||
// or when quotas are exceeded. don't retry when quotas are exceeded
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::LOCKED,
|
||||
ref error,
|
||||
..
|
||||
} => {
|
||||
!error.contains("quota exceeded")
|
||||
&& !error.contains("the limit for current plan reached")
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
// If the error message does not have a status,
|
||||
// the error is unknown and probably should not retry automatically
|
||||
let Some(status) = &self.status else {
|
||||
return false;
|
||||
};
|
||||
|
||||
// retry if the retry info is set.
|
||||
if status.details.retry_info.is_some() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// retry if the response has a retry delay
|
||||
if let Some(retry_info) = self
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.retry_info.as_ref())
|
||||
{
|
||||
retry_info.retry_delay_ms > 0
|
||||
} else {
|
||||
false
|
||||
// if no retry info set, attempt to use the error code to guess the retry state.
|
||||
let reason = status
|
||||
.details
|
||||
.error_info
|
||||
.map_or(Reason::Unknown, |e| e.reason);
|
||||
match reason {
|
||||
// not a transitive error
|
||||
Reason::RoleProtected => false,
|
||||
// on retry, it will still not be found
|
||||
Reason::ResourceNotFound
|
||||
| Reason::ProjectNotFound
|
||||
| Reason::EndpointNotFound
|
||||
| Reason::BranchNotFound => false,
|
||||
// we were asked to go away
|
||||
Reason::RateLimitExceeded
|
||||
| Reason::NonDefaultBranchComputeTimeExceeded
|
||||
| Reason::ActiveTimeQuotaExceeded
|
||||
| Reason::ComputeTimeQuotaExceeded
|
||||
| Reason::WrittenDataQuotaExceeded
|
||||
| Reason::DataTransferQuotaExceeded
|
||||
| Reason::LogicalSizeQuotaExceeded => false,
|
||||
// transitive error. control plane is currently busy
|
||||
// but might be ready soon
|
||||
Reason::RunningOperations => true,
|
||||
Reason::ConcurrencyLimitReached => true,
|
||||
Reason::LockAlreadyTaken => true,
|
||||
// unknown error. better not retry it.
|
||||
Reason::Unknown => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -121,7 +123,7 @@ pub struct Details {
|
||||
pub user_facing_message: Option<UserFacingMessage>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Copy, Clone, Debug, Deserialize)]
|
||||
pub struct ErrorInfo {
|
||||
pub reason: Reason,
|
||||
// Schema could also have `metadata` field, but it's not structured. Skip it for now.
|
||||
@@ -129,30 +131,59 @@ pub struct ErrorInfo {
|
||||
|
||||
#[derive(Clone, Copy, Debug, Deserialize, Default)]
|
||||
pub enum Reason {
|
||||
/// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles.
|
||||
#[serde(rename = "ROLE_PROTECTED")]
|
||||
RoleProtected,
|
||||
/// ResourceNotFound indicates that a resource (project, endpoint, branch, etc.) wasn't found,
|
||||
/// usually due to the provided ID not being correct or because the subject doesn't have enough permissions to
|
||||
/// access the requested resource.
|
||||
/// Prefer a more specific reason if possible, e.g., ProjectNotFound, EndpointNotFound, etc.
|
||||
#[serde(rename = "RESOURCE_NOT_FOUND")]
|
||||
ResourceNotFound,
|
||||
/// ProjectNotFound indicates that the project wasn't found, usually due to the provided ID not being correct,
|
||||
/// or that the subject doesn't have enough permissions to access the requested project.
|
||||
#[serde(rename = "PROJECT_NOT_FOUND")]
|
||||
ProjectNotFound,
|
||||
/// EndpointNotFound indicates that the endpoint wasn't found, usually due to the provided ID not being correct,
|
||||
/// or that the subject doesn't have enough permissions to access the requested endpoint.
|
||||
#[serde(rename = "ENDPOINT_NOT_FOUND")]
|
||||
EndpointNotFound,
|
||||
/// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct,
|
||||
/// or that the subject doesn't have enough permissions to access the requested branch.
|
||||
#[serde(rename = "BRANCH_NOT_FOUND")]
|
||||
BranchNotFound,
|
||||
/// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
|
||||
#[serde(rename = "RATE_LIMIT_EXCEEDED")]
|
||||
RateLimitExceeded,
|
||||
/// NonDefaultBranchComputeTimeExceeded indicates that the compute time quota of non-default branches has been
|
||||
/// exceeded.
|
||||
#[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
|
||||
NonPrimaryBranchComputeTimeExceeded,
|
||||
NonDefaultBranchComputeTimeExceeded,
|
||||
/// ActiveTimeQuotaExceeded indicates that the active time quota was exceeded.
|
||||
#[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
|
||||
ActiveTimeQuotaExceeded,
|
||||
/// ComputeTimeQuotaExceeded indicates that the compute time quota was exceeded.
|
||||
#[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
|
||||
ComputeTimeQuotaExceeded,
|
||||
/// WrittenDataQuotaExceeded indicates that the written data quota was exceeded.
|
||||
#[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
|
||||
WrittenDataQuotaExceeded,
|
||||
/// DataTransferQuotaExceeded indicates that the data transfer quota was exceeded.
|
||||
#[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
|
||||
DataTransferQuotaExceeded,
|
||||
/// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded.
|
||||
#[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
|
||||
LogicalSizeQuotaExceeded,
|
||||
/// RunningOperations indicates that the project already has some running operations
|
||||
/// and scheduling of new ones is prohibited.
|
||||
#[serde(rename = "RUNNING_OPERATIONS")]
|
||||
RunningOperations,
|
||||
/// ConcurrencyLimitReached indicates that the concurrency limit for an action was reached.
|
||||
#[serde(rename = "CONCURRENCY_LIMIT_REACHED")]
|
||||
ConcurrencyLimitReached,
|
||||
/// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken.
|
||||
#[serde(rename = "LOCK_ALREADY_TAKEN")]
|
||||
LockAlreadyTaken,
|
||||
#[default]
|
||||
#[serde(other)]
|
||||
Unknown,
|
||||
@@ -170,7 +201,7 @@ impl Reason {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Copy, Clone, Debug, Deserialize)]
|
||||
pub struct RetryInfo {
|
||||
pub retry_delay_ms: u64,
|
||||
}
|
||||
|
||||
@@ -25,9 +25,9 @@ use tracing::info;
|
||||
|
||||
pub mod errors {
|
||||
use crate::{
|
||||
console::messages::{self, ConsoleError},
|
||||
console::messages::{self, ConsoleError, Reason},
|
||||
error::{io_error, ReportableError, UserFacingError},
|
||||
proxy::retry::ShouldRetry,
|
||||
proxy::retry::CouldRetry,
|
||||
};
|
||||
use thiserror::Error;
|
||||
|
||||
@@ -76,21 +76,22 @@ pub mod errors {
|
||||
ApiError::Console(e) => {
|
||||
use crate::error::ErrorKind::*;
|
||||
match e.get_reason() {
|
||||
crate::console::messages::Reason::RoleProtected => User,
|
||||
crate::console::messages::Reason::ResourceNotFound => User,
|
||||
crate::console::messages::Reason::ProjectNotFound => User,
|
||||
crate::console::messages::Reason::EndpointNotFound => User,
|
||||
crate::console::messages::Reason::BranchNotFound => User,
|
||||
crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit,
|
||||
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
|
||||
User
|
||||
}
|
||||
crate::console::messages::Reason::ActiveTimeQuotaExceeded => User,
|
||||
crate::console::messages::Reason::ComputeTimeQuotaExceeded => User,
|
||||
crate::console::messages::Reason::WrittenDataQuotaExceeded => User,
|
||||
crate::console::messages::Reason::DataTransferQuotaExceeded => User,
|
||||
crate::console::messages::Reason::LogicalSizeQuotaExceeded => User,
|
||||
crate::console::messages::Reason::Unknown => match &e {
|
||||
Reason::RoleProtected => User,
|
||||
Reason::ResourceNotFound => User,
|
||||
Reason::ProjectNotFound => User,
|
||||
Reason::EndpointNotFound => User,
|
||||
Reason::BranchNotFound => User,
|
||||
Reason::RateLimitExceeded => ServiceRateLimit,
|
||||
Reason::NonDefaultBranchComputeTimeExceeded => User,
|
||||
Reason::ActiveTimeQuotaExceeded => User,
|
||||
Reason::ComputeTimeQuotaExceeded => User,
|
||||
Reason::WrittenDataQuotaExceeded => User,
|
||||
Reason::DataTransferQuotaExceeded => User,
|
||||
Reason::LogicalSizeQuotaExceeded => User,
|
||||
Reason::ConcurrencyLimitReached => ControlPlane,
|
||||
Reason::LockAlreadyTaken => ControlPlane,
|
||||
Reason::RunningOperations => ControlPlane,
|
||||
Reason::Unknown => match &e {
|
||||
ConsoleError {
|
||||
http_status_code:
|
||||
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
@@ -128,7 +129,7 @@ pub mod errors {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for ApiError {
|
||||
impl CouldRetry for ApiError {
|
||||
fn could_retry(&self) -> bool {
|
||||
match self {
|
||||
// retry some transport errors
|
||||
@@ -239,6 +240,17 @@ pub mod errors {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CouldRetry for WakeComputeError {
|
||||
fn could_retry(&self) -> bool {
|
||||
match self {
|
||||
WakeComputeError::BadComputeAddress(_) => false,
|
||||
WakeComputeError::ApiError(e) => e.could_retry(),
|
||||
WakeComputeError::TooManyConnections => false,
|
||||
WakeComputeError::TooManyConnectionAttempts(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Auth secret which is managed by the cloud.
|
||||
|
||||
@@ -8,6 +8,7 @@ pub mod passthrough;
|
||||
pub mod retry;
|
||||
pub mod wake_compute;
|
||||
pub use copy_bidirectional::copy_bidirectional_client_compute;
|
||||
pub use copy_bidirectional::ErrorSource;
|
||||
|
||||
use crate::{
|
||||
auth,
|
||||
@@ -148,8 +149,11 @@ pub async fn task_main(
|
||||
ctx.log_connect();
|
||||
match p.proxy_pass().instrument(span.clone()).await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
error!(parent: &span, "per-client task finished with an error: {e:#}");
|
||||
Err(ErrorSource::Client(e)) => {
|
||||
error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
|
||||
}
|
||||
Err(ErrorSource::Compute(e)) => {
|
||||
error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use crate::{
|
||||
error::ReportableError,
|
||||
metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
|
||||
proxy::{
|
||||
retry::{retry_after, ShouldRetry},
|
||||
retry::{retry_after, should_retry, CouldRetry},
|
||||
wake_compute::wake_compute,
|
||||
},
|
||||
Host,
|
||||
@@ -17,6 +17,8 @@ use pq_proto::StartupMessageParams;
|
||||
use tokio::time;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use super::retry::ShouldRetryWakeCompute;
|
||||
|
||||
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
|
||||
|
||||
/// If we couldn't connect, a cached connection info might be to blame
|
||||
@@ -104,7 +106,7 @@ pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
||||
connect_to_compute_retry_config: RetryConfig,
|
||||
) -> Result<M::Connection, M::Error>
|
||||
where
|
||||
M::ConnectError: ShouldRetry + std::fmt::Debug,
|
||||
M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug,
|
||||
M::Error: From<WakeComputeError>,
|
||||
{
|
||||
let mut num_retries = 0;
|
||||
@@ -139,10 +141,10 @@ where
|
||||
|
||||
error!(error = ?err, "could not connect to compute node");
|
||||
|
||||
let node_info = if !node_info.cached() || !err.should_retry_database_address() {
|
||||
let node_info = if !node_info.cached() || !err.should_retry_wake_compute() {
|
||||
// If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
|
||||
// Do not need to retrieve a new node_info, just return the old one.
|
||||
if !err.should_retry(num_retries, connect_to_compute_retry_config) {
|
||||
if should_retry(&err, num_retries, connect_to_compute_retry_config) {
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Failed,
|
||||
@@ -188,9 +190,8 @@ where
|
||||
return Ok(res);
|
||||
}
|
||||
Err(e) => {
|
||||
let retriable = e.should_retry(num_retries, connect_to_compute_retry_config);
|
||||
if !retriable {
|
||||
error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
|
||||
if !should_retry(&e, num_retries, connect_to_compute_retry_config) {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't connect to compute node");
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Failed,
|
||||
@@ -200,9 +201,10 @@ where
|
||||
);
|
||||
return Err(e.into());
|
||||
}
|
||||
warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
|
||||
|
||||
warn!(error = ?e, num_retries, retriable = true, "couldn't connect to compute node");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
|
||||
num_retries += 1;
|
||||
|
||||
@@ -13,12 +13,39 @@ enum TransferState {
|
||||
Done(u64),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ErrorDirection {
|
||||
Read(io::Error),
|
||||
Write(io::Error),
|
||||
}
|
||||
|
||||
impl ErrorSource {
|
||||
fn from_client(err: ErrorDirection) -> ErrorSource {
|
||||
match err {
|
||||
ErrorDirection::Read(client) => Self::Client(client),
|
||||
ErrorDirection::Write(compute) => Self::Compute(compute),
|
||||
}
|
||||
}
|
||||
fn from_compute(err: ErrorDirection) -> ErrorSource {
|
||||
match err {
|
||||
ErrorDirection::Write(client) => Self::Client(client),
|
||||
ErrorDirection::Read(compute) => Self::Compute(compute),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ErrorSource {
|
||||
Client(io::Error),
|
||||
Compute(io::Error),
|
||||
}
|
||||
|
||||
fn transfer_one_direction<A, B>(
|
||||
cx: &mut Context<'_>,
|
||||
state: &mut TransferState,
|
||||
r: &mut A,
|
||||
w: &mut B,
|
||||
) -> Poll<io::Result<u64>>
|
||||
) -> Poll<Result<u64, ErrorDirection>>
|
||||
where
|
||||
A: AsyncRead + AsyncWrite + Unpin + ?Sized,
|
||||
B: AsyncRead + AsyncWrite + Unpin + ?Sized,
|
||||
@@ -32,7 +59,7 @@ where
|
||||
*state = TransferState::ShuttingDown(count);
|
||||
}
|
||||
TransferState::ShuttingDown(count) => {
|
||||
ready!(w.as_mut().poll_shutdown(cx))?;
|
||||
ready!(w.as_mut().poll_shutdown(cx)).map_err(ErrorDirection::Write)?;
|
||||
*state = TransferState::Done(*count);
|
||||
}
|
||||
TransferState::Done(count) => return Poll::Ready(Ok(*count)),
|
||||
@@ -44,7 +71,7 @@ where
|
||||
pub async fn copy_bidirectional_client_compute<Client, Compute>(
|
||||
client: &mut Client,
|
||||
compute: &mut Compute,
|
||||
) -> Result<(u64, u64), std::io::Error>
|
||||
) -> Result<(u64, u64), ErrorSource>
|
||||
where
|
||||
Client: AsyncRead + AsyncWrite + Unpin + ?Sized,
|
||||
Compute: AsyncRead + AsyncWrite + Unpin + ?Sized,
|
||||
@@ -54,9 +81,11 @@ where
|
||||
|
||||
poll_fn(|cx| {
|
||||
let mut client_to_compute_result =
|
||||
transfer_one_direction(cx, &mut client_to_compute, client, compute)?;
|
||||
transfer_one_direction(cx, &mut client_to_compute, client, compute)
|
||||
.map_err(ErrorSource::from_client)?;
|
||||
let mut compute_to_client_result =
|
||||
transfer_one_direction(cx, &mut compute_to_client, compute, client)?;
|
||||
transfer_one_direction(cx, &mut compute_to_client, compute, client)
|
||||
.map_err(ErrorSource::from_compute)?;
|
||||
|
||||
// Early termination checks from compute to client.
|
||||
if let TransferState::Done(_) = compute_to_client {
|
||||
@@ -65,18 +94,20 @@ where
|
||||
// Initiate shutdown
|
||||
client_to_compute = TransferState::ShuttingDown(buf.amt);
|
||||
client_to_compute_result =
|
||||
transfer_one_direction(cx, &mut client_to_compute, client, compute)?;
|
||||
transfer_one_direction(cx, &mut client_to_compute, client, compute)
|
||||
.map_err(ErrorSource::from_client)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Early termination checks from compute to client.
|
||||
// Early termination checks from client to compute.
|
||||
if let TransferState::Done(_) = client_to_compute {
|
||||
if let TransferState::Running(buf) = &compute_to_client {
|
||||
info!("Client is done, terminate compute");
|
||||
// Initiate shutdown
|
||||
compute_to_client = TransferState::ShuttingDown(buf.amt);
|
||||
compute_to_client_result =
|
||||
transfer_one_direction(cx, &mut compute_to_client, client, compute)?;
|
||||
transfer_one_direction(cx, &mut compute_to_client, compute, client)
|
||||
.map_err(ErrorSource::from_compute)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,7 +169,7 @@ impl CopyBuffer {
|
||||
cx: &mut Context<'_>,
|
||||
mut reader: Pin<&mut R>,
|
||||
mut writer: Pin<&mut W>,
|
||||
) -> Poll<io::Result<usize>>
|
||||
) -> Poll<Result<usize, ErrorDirection>>
|
||||
where
|
||||
R: AsyncRead + ?Sized,
|
||||
W: AsyncWrite + ?Sized,
|
||||
@@ -149,11 +180,11 @@ impl CopyBuffer {
|
||||
// Top up the buffer towards full if we can read a bit more
|
||||
// data - this should improve the chances of a large write
|
||||
if !me.read_done && me.cap < me.buf.len() {
|
||||
ready!(me.poll_fill_buf(cx, reader.as_mut()))?;
|
||||
ready!(me.poll_fill_buf(cx, reader.as_mut())).map_err(ErrorDirection::Read)?;
|
||||
}
|
||||
Poll::Pending
|
||||
}
|
||||
res => res,
|
||||
res => res.map_err(ErrorDirection::Write),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,7 +193,7 @@ impl CopyBuffer {
|
||||
cx: &mut Context<'_>,
|
||||
mut reader: Pin<&mut R>,
|
||||
mut writer: Pin<&mut W>,
|
||||
) -> Poll<io::Result<u64>>
|
||||
) -> Poll<Result<u64, ErrorDirection>>
|
||||
where
|
||||
R: AsyncRead + ?Sized,
|
||||
W: AsyncWrite + ?Sized,
|
||||
@@ -176,12 +207,13 @@ impl CopyBuffer {
|
||||
|
||||
match self.poll_fill_buf(cx, reader.as_mut()) {
|
||||
Poll::Ready(Ok(())) => (),
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))),
|
||||
Poll::Pending => {
|
||||
// Try flushing when the reader has no progress to avoid deadlock
|
||||
// when the reader depends on buffered writer.
|
||||
if self.need_flush {
|
||||
ready!(writer.as_mut().poll_flush(cx))?;
|
||||
ready!(writer.as_mut().poll_flush(cx))
|
||||
.map_err(ErrorDirection::Write)?;
|
||||
self.need_flush = false;
|
||||
}
|
||||
|
||||
@@ -194,10 +226,10 @@ impl CopyBuffer {
|
||||
while self.pos < self.cap {
|
||||
let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?;
|
||||
if i == 0 {
|
||||
return Poll::Ready(Err(io::Error::new(
|
||||
return Poll::Ready(Err(ErrorDirection::Write(io::Error::new(
|
||||
io::ErrorKind::WriteZero,
|
||||
"write zero byte into writer",
|
||||
)));
|
||||
))));
|
||||
} else {
|
||||
self.pos += i;
|
||||
self.amt += i as u64;
|
||||
@@ -216,7 +248,7 @@ impl CopyBuffer {
|
||||
// If we've written all the data and we've seen EOF, flush out the
|
||||
// data and finish the transfer.
|
||||
if self.pos == self.cap && self.read_done {
|
||||
ready!(writer.as_mut().poll_flush(cx))?;
|
||||
ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?;
|
||||
return Poll::Ready(Ok(self.amt));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,13 +10,15 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
|
||||
use super::copy_bidirectional::ErrorSource;
|
||||
|
||||
/// Forward bytes in both directions (client <-> compute).
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn proxy_pass(
|
||||
client: impl AsyncRead + AsyncWrite + Unpin,
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: MetricsAuxInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), ErrorSource> {
|
||||
let usage = USAGE_METRICS.register(Ids {
|
||||
endpoint_id: aux.endpoint_id,
|
||||
branch_id: aux.branch_id,
|
||||
@@ -66,9 +68,11 @@ pub struct ProxyPassthrough<P, S> {
|
||||
}
|
||||
|
||||
impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
|
||||
pub async fn proxy_pass(self) -> anyhow::Result<()> {
|
||||
pub async fn proxy_pass(self) -> Result<(), ErrorSource> {
|
||||
let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
|
||||
self.compute.cancel_closure.try_cancel_query().await?;
|
||||
if let Err(err) = self.compute.cancel_closure.try_cancel_query().await {
|
||||
tracing::error!(?err, "could not cancel the query in the database");
|
||||
}
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,20 +2,22 @@ use crate::{compute, config::RetryConfig};
|
||||
use std::{error::Error, io};
|
||||
use tokio::time;
|
||||
|
||||
pub trait ShouldRetry {
|
||||
pub trait CouldRetry {
|
||||
/// Returns true if the error could be retried
|
||||
fn could_retry(&self) -> bool;
|
||||
fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool {
|
||||
match self {
|
||||
_ if num_retries >= config.max_retries => false,
|
||||
err => err.could_retry(),
|
||||
}
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for io::Error {
|
||||
pub trait ShouldRetryWakeCompute {
|
||||
/// Returns true if we need to invalidate the cache for this node.
|
||||
/// If false, we can continue retrying with the current node cache.
|
||||
fn should_retry_wake_compute(&self) -> bool;
|
||||
}
|
||||
|
||||
pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool {
|
||||
num_retries < config.max_retries && err.could_retry()
|
||||
}
|
||||
|
||||
impl CouldRetry for io::Error {
|
||||
fn could_retry(&self) -> bool {
|
||||
use std::io::ErrorKind;
|
||||
matches!(
|
||||
@@ -25,7 +27,7 @@ impl ShouldRetry for io::Error {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for tokio_postgres::error::DbError {
|
||||
impl CouldRetry for tokio_postgres::error::DbError {
|
||||
fn could_retry(&self) -> bool {
|
||||
use tokio_postgres::error::SqlState;
|
||||
matches!(
|
||||
@@ -36,7 +38,9 @@ impl ShouldRetry for tokio_postgres::error::DbError {
|
||||
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
|
||||
)
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
}
|
||||
impl ShouldRetryWakeCompute for tokio_postgres::error::DbError {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
use tokio_postgres::error::SqlState;
|
||||
// Here are errors that happens after the user successfully authenticated to the database.
|
||||
// TODO: there are pgbouncer errors that should be retried, but they are not listed here.
|
||||
@@ -53,7 +57,7 @@ impl ShouldRetry for tokio_postgres::error::DbError {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for tokio_postgres::Error {
|
||||
impl CouldRetry for tokio_postgres::Error {
|
||||
fn could_retry(&self) -> bool {
|
||||
if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
|
||||
io::Error::could_retry(io_err)
|
||||
@@ -63,29 +67,33 @@ impl ShouldRetry for tokio_postgres::Error {
|
||||
false
|
||||
}
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
|
||||
io::Error::should_retry_database_address(io_err)
|
||||
} else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
|
||||
tokio_postgres::error::DbError::should_retry_database_address(db_err)
|
||||
}
|
||||
impl ShouldRetryWakeCompute for tokio_postgres::Error {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
|
||||
tokio_postgres::error::DbError::should_retry_wake_compute(db_err)
|
||||
} else {
|
||||
// likely an IO error. Possible the compute has shutdown and the
|
||||
// cache is stale.
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for compute::ConnectionError {
|
||||
impl CouldRetry for compute::ConnectionError {
|
||||
fn could_retry(&self) -> bool {
|
||||
match self {
|
||||
compute::ConnectionError::Postgres(err) => err.could_retry(),
|
||||
compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
|
||||
compute::ConnectionError::WakeComputeError(err) => err.could_retry(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
}
|
||||
impl ShouldRetryWakeCompute for compute::ConnectionError {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
match self {
|
||||
compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
|
||||
compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
|
||||
compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(),
|
||||
// the cache entry was not checked for validity
|
||||
compute::ConnectionError::TooManyConnectionAttempts(_) => false,
|
||||
_ => true,
|
||||
|
||||
@@ -5,21 +5,21 @@ mod mitm;
|
||||
use std::time::Duration;
|
||||
|
||||
use super::connect_compute::ConnectMechanism;
|
||||
use super::retry::ShouldRetry;
|
||||
use super::retry::CouldRetry;
|
||||
use super::*;
|
||||
use crate::auth::backend::{
|
||||
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
|
||||
};
|
||||
use crate::config::{CertResolver, RetryConfig};
|
||||
use crate::console::caches::NodeInfoCache;
|
||||
use crate::console::messages::{ConsoleError, MetricsAuxInfo};
|
||||
use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status};
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::proxy::retry::retry_after;
|
||||
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
use retry::{retry_after, ShouldRetryWakeCompute};
|
||||
use rstest::rstest;
|
||||
use rustls::pki_types;
|
||||
use tokio_postgres::config::SslMode;
|
||||
@@ -438,11 +438,16 @@ impl std::fmt::Display for TestConnectError {
|
||||
|
||||
impl std::error::Error for TestConnectError {}
|
||||
|
||||
impl ShouldRetry for TestConnectError {
|
||||
impl CouldRetry for TestConnectError {
|
||||
fn could_retry(&self) -> bool {
|
||||
self.retryable
|
||||
}
|
||||
}
|
||||
impl ShouldRetryWakeCompute for TestConnectError {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TestConnectMechanism {
|
||||
@@ -485,7 +490,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
|
||||
ConnectAction::WakeFail => {
|
||||
let err = console::errors::ApiError::Console(ConsoleError {
|
||||
http_status_code: http::StatusCode::FORBIDDEN,
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
error: "TEST".into(),
|
||||
status: None,
|
||||
});
|
||||
@@ -496,7 +501,15 @@ impl TestBackend for TestConnectMechanism {
|
||||
let err = console::errors::ApiError::Console(ConsoleError {
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
error: "TEST".into(),
|
||||
status: None,
|
||||
status: Some(Status {
|
||||
code: "error".into(),
|
||||
message: "error".into(),
|
||||
details: Details {
|
||||
error_info: None,
|
||||
retry_info: Some(console::messages::RetryInfo { retry_delay_ms: 1 }),
|
||||
user_facing_message: None,
|
||||
},
|
||||
}),
|
||||
});
|
||||
assert!(err.could_retry());
|
||||
Err(console::errors::WakeComputeError::ApiError(err))
|
||||
|
||||
@@ -1,18 +1,16 @@
|
||||
use crate::config::RetryConfig;
|
||||
use crate::console::messages::ConsoleError;
|
||||
use crate::console::messages::{ConsoleError, Reason};
|
||||
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::{
|
||||
ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
|
||||
WakeupFailureKind,
|
||||
};
|
||||
use crate::proxy::retry::retry_after;
|
||||
use crate::proxy::retry::{retry_after, should_retry};
|
||||
use hyper1::StatusCode;
|
||||
use std::ops::ControlFlow;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use super::connect_compute::ComputeConnectBackend;
|
||||
use super::retry::ShouldRetry;
|
||||
|
||||
pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
num_retries: &mut u32,
|
||||
@@ -22,9 +20,8 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
let retry_type = RetryType::WakeCompute;
|
||||
loop {
|
||||
let wake_res = api.wake_compute(ctx).await;
|
||||
match handle_try_wake(wake_res, *num_retries, config) {
|
||||
Err(e) => {
|
||||
match api.wake_compute(ctx).await {
|
||||
Err(e) if !should_retry(&e, *num_retries, config) => {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
|
||||
report_error(&e, false);
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
@@ -36,11 +33,11 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
);
|
||||
return Err(e);
|
||||
}
|
||||
Ok(ControlFlow::Continue(e)) => {
|
||||
Err(e) => {
|
||||
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
|
||||
report_error(&e, true);
|
||||
}
|
||||
Ok(ControlFlow::Break(n)) => {
|
||||
Ok(n) => {
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Success,
|
||||
@@ -63,70 +60,28 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempts to wake up the compute node.
|
||||
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
|
||||
/// * Returns Ok(Break(node)) if the wakeup succeeded
|
||||
/// * Returns Err(e) if there was an error
|
||||
pub fn handle_try_wake(
|
||||
result: Result<CachedNodeInfo, WakeComputeError>,
|
||||
num_retries: u32,
|
||||
config: RetryConfig,
|
||||
) -> Result<ControlFlow<CachedNodeInfo, WakeComputeError>, WakeComputeError> {
|
||||
match result {
|
||||
Err(err) => match &err {
|
||||
WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => {
|
||||
Ok(ControlFlow::Continue(err))
|
||||
}
|
||||
_ => Err(err),
|
||||
},
|
||||
// Ready to try again.
|
||||
Ok(new) => Ok(ControlFlow::Break(new)),
|
||||
}
|
||||
}
|
||||
|
||||
fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
use crate::console::errors::ApiError;
|
||||
let kind = match e {
|
||||
WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
|
||||
WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
|
||||
WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() {
|
||||
crate::console::messages::Reason::RoleProtected => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::ResourceNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::ProjectNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::EndpointNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::BranchNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::RateLimitExceeded => {
|
||||
WakeupFailureKind::ApiConsoleLocked
|
||||
}
|
||||
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::ActiveTimeQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::ComputeTimeQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::WrittenDataQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::DataTransferQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::LogicalSizeQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::Unknown => match e {
|
||||
Reason::RoleProtected => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
Reason::EndpointNotFound => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked,
|
||||
Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded,
|
||||
Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded,
|
||||
Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded,
|
||||
Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded,
|
||||
Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded,
|
||||
Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded,
|
||||
Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked,
|
||||
Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked,
|
||||
Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked,
|
||||
Reason::Unknown => match e {
|
||||
ConsoleError {
|
||||
http_status_code: StatusCode::LOCKED,
|
||||
ref error,
|
||||
|
||||
@@ -16,7 +16,10 @@ use crate::{
|
||||
context::RequestMonitoring,
|
||||
error::{ErrorKind, ReportableError, UserFacingError},
|
||||
intern::EndpointIdInt,
|
||||
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
|
||||
proxy::{
|
||||
connect_compute::ConnectMechanism,
|
||||
retry::{CouldRetry, ShouldRetryWakeCompute},
|
||||
},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
Host,
|
||||
};
|
||||
@@ -179,7 +182,7 @@ impl UserFacingError for HttpConnError {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for HttpConnError {
|
||||
impl CouldRetry for HttpConnError {
|
||||
fn could_retry(&self) -> bool {
|
||||
match self {
|
||||
HttpConnError::ConnectionError(e) => e.could_retry(),
|
||||
@@ -190,9 +193,11 @@ impl ShouldRetry for HttpConnError {
|
||||
HttpConnError::TooManyConnectionAttempts(_) => false,
|
||||
}
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
}
|
||||
impl ShouldRetryWakeCompute for HttpConnError {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
match self {
|
||||
HttpConnError::ConnectionError(e) => e.should_retry_database_address(),
|
||||
HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(),
|
||||
// we never checked cache validity
|
||||
HttpConnError::TooManyConnectionAttempts(_) => false,
|
||||
_ => true,
|
||||
@@ -231,6 +236,10 @@ impl ConnectMechanism for TokioMechanism {
|
||||
.dbname(&self.conn_info.dbname)
|
||||
.connect_timeout(timeout);
|
||||
|
||||
config
|
||||
.param("client_encoding", "UTF8")
|
||||
.expect("client encoding UTF8 is always valid");
|
||||
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let res = config.connect(tokio_postgres::NoTls).await;
|
||||
drop(pause);
|
||||
|
||||
@@ -202,6 +202,7 @@ fn get_conn_info(
|
||||
options = Some(NeonOptions::parse_options_raw(&value));
|
||||
}
|
||||
}
|
||||
ctx.set_db_options(params.freeze());
|
||||
|
||||
let user_info = ComputeUserInfo {
|
||||
endpoint,
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use crate::proxy::ErrorSource;
|
||||
use crate::{
|
||||
cancellation::CancellationHandlerMain,
|
||||
config::ProxyConfig,
|
||||
@@ -7,6 +8,7 @@ use crate::{
|
||||
proxy::{handle_client, ClientMode},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
};
|
||||
use anyhow::Context as _;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use framed_websockets::{Frame, OpCode, WebSocketServer};
|
||||
use futures::{Sink, Stream};
|
||||
@@ -165,7 +167,11 @@ pub async fn serve_websocket(
|
||||
Ok(Some(p)) => {
|
||||
ctx.set_success();
|
||||
ctx.log_connect();
|
||||
p.proxy_pass().await
|
||||
match p.proxy_pass().await {
|
||||
Ok(()) => Ok(()),
|
||||
Err(ErrorSource::Client(err)) => Err(err).context("client"),
|
||||
Err(ErrorSource::Compute(err)) => Err(err).context("compute"),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,7 +28,8 @@ use utils::pid_file;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
use safekeeper::defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR,
|
||||
DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
|
||||
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
|
||||
};
|
||||
use safekeeper::http;
|
||||
@@ -172,6 +173,7 @@ struct Args {
|
||||
walsenders_keep_horizon: bool,
|
||||
/// Enable partial backup. If disabled, safekeeper will not upload partial
|
||||
/// segments to remote storage.
|
||||
/// TODO: now partial backup is always enabled, remove this flag.
|
||||
#[arg(long)]
|
||||
partial_backup_enabled: bool,
|
||||
/// Controls how long backup will wait until uploading the partial segment.
|
||||
@@ -181,6 +183,18 @@ struct Args {
|
||||
/// be used in tests.
|
||||
#[arg(long)]
|
||||
disable_periodic_broker_push: bool,
|
||||
/// Enable automatic switching to offloaded state.
|
||||
#[arg(long)]
|
||||
enable_offload: bool,
|
||||
/// Delete local WAL files after offloading. When disabled, they will be left on disk.
|
||||
#[arg(long)]
|
||||
delete_offloaded_wal: bool,
|
||||
/// Pending updates to control file will be automatically saved after this interval.
|
||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)]
|
||||
control_file_save_interval: Duration,
|
||||
/// Number of allowed concurrent uploads of partial segments to remote storage.
|
||||
#[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)]
|
||||
partial_backup_concurrency: usize,
|
||||
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
@@ -328,9 +342,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
sk_auth_token,
|
||||
current_thread_runtime: args.current_thread_runtime,
|
||||
walsenders_keep_horizon: args.walsenders_keep_horizon,
|
||||
partial_backup_enabled: args.partial_backup_enabled,
|
||||
partial_backup_enabled: true,
|
||||
partial_backup_timeout: args.partial_backup_timeout,
|
||||
disable_periodic_broker_push: args.disable_periodic_broker_push,
|
||||
enable_offload: args.enable_offload,
|
||||
delete_offloaded_wal: args.delete_offloaded_wal,
|
||||
control_file_save_interval: args.control_file_save_interval,
|
||||
partial_backup_concurrency: args.partial_backup_concurrency,
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
|
||||
@@ -72,6 +72,9 @@ impl FileStorage {
|
||||
conf: &SafeKeeperConf,
|
||||
state: TimelinePersistentState,
|
||||
) -> Result<FileStorage> {
|
||||
// we don't support creating new timelines in offloaded state
|
||||
assert!(matches!(state.eviction_state, EvictionState::Present));
|
||||
|
||||
let store = FileStorage {
|
||||
timeline_dir,
|
||||
no_sync: conf.no_sync,
|
||||
@@ -103,7 +106,7 @@ impl FileStorage {
|
||||
}
|
||||
|
||||
/// Load control file from given directory.
|
||||
pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
|
||||
fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
|
||||
let path = timeline_dir.join(CONTROL_FILE_NAME);
|
||||
Self::load_control_file(path)
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ use crate::{
|
||||
control_file::{FileStorage, Storage},
|
||||
pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
|
||||
state::TimelinePersistentState,
|
||||
timeline::{FullAccessTimeline, Timeline, TimelineError},
|
||||
timeline::{Timeline, TimelineError, WalResidentTimeline},
|
||||
wal_backup::copy_s3_segments,
|
||||
wal_storage::{wal_file_paths, WalReader},
|
||||
GlobalTimelines,
|
||||
@@ -46,7 +46,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
let source_tli = request.source.full_access_guard().await?;
|
||||
let source_tli = request.source.wal_residence_guard().await?;
|
||||
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
let ttid = request.destination_ttid;
|
||||
@@ -159,7 +159,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
}
|
||||
|
||||
async fn copy_disk_segments(
|
||||
tli: &FullAccessTimeline,
|
||||
tli: &WalResidentTimeline,
|
||||
wal_seg_size: usize,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
@@ -183,7 +183,7 @@ async fn copy_disk_segments(
|
||||
let copy_end = copy_end - segment_start;
|
||||
|
||||
let wal_file_path = {
|
||||
let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?;
|
||||
let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size);
|
||||
|
||||
if segment == last_segment {
|
||||
partial
|
||||
|
||||
@@ -28,7 +28,8 @@ use crate::send_wal::WalSenderState;
|
||||
use crate::state::TimelineMemState;
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::timeline::get_timeline_dir;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::timeline_manager;
|
||||
use crate::GlobalTimelines;
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
@@ -168,6 +169,7 @@ pub struct Memory {
|
||||
pub last_removed_segno: XLogSegNo,
|
||||
pub epoch_start_lsn: Lsn,
|
||||
pub mem_state: TimelineMemState,
|
||||
pub mgr_status: timeline_manager::Status,
|
||||
|
||||
// PhysicalStorage state.
|
||||
pub write_lsn: Lsn,
|
||||
@@ -326,7 +328,7 @@ pub struct TimelineDigest {
|
||||
}
|
||||
|
||||
pub async fn calculate_digest(
|
||||
tli: &FullAccessTimeline,
|
||||
tli: &WalResidentTimeline,
|
||||
request: TimelineDigestRequest,
|
||||
) -> Result<TimelineDigest> {
|
||||
if request.from_lsn > request.until_lsn {
|
||||
|
||||
@@ -214,10 +214,10 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
// Note: with evicted timelines it should work better then de-evict them and
|
||||
// stream; probably start_snapshot would copy partial s3 file to dest path
|
||||
// and stream control file, or return FullAccessTimeline if timeline is not
|
||||
// and stream control file, or return WalResidentTimeline if timeline is not
|
||||
// evicted.
|
||||
let tli = tli
|
||||
.full_access_guard()
|
||||
.wal_residence_guard()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -283,7 +283,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = tli
|
||||
.full_access_guard()
|
||||
.wal_residence_guard()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -306,7 +306,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
tli.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
.state
|
||||
.state_mut()
|
||||
.flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::safekeeper::{
|
||||
};
|
||||
use crate::safekeeper::{Term, TermHistory, TermLsn};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::GlobalTimelines;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_ffi::encode_logical_message;
|
||||
@@ -102,7 +102,7 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
async fn prepare_safekeeper(
|
||||
ttid: TenantTimelineId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<FullAccessTimeline> {
|
||||
) -> anyhow::Result<WalResidentTimeline> {
|
||||
let tli = GlobalTimelines::create(
|
||||
ttid,
|
||||
ServerInfo {
|
||||
@@ -115,11 +115,11 @@ async fn prepare_safekeeper(
|
||||
)
|
||||
.await?;
|
||||
|
||||
tli.full_access_guard().await
|
||||
tli.wal_residence_guard().await
|
||||
}
|
||||
|
||||
async fn send_proposer_elected(
|
||||
tli: &FullAccessTimeline,
|
||||
tli: &WalResidentTimeline,
|
||||
term: Term,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -151,7 +151,7 @@ pub struct InsertedWAL {
|
||||
/// Extend local WAL with new LogicalMessage record. To do that,
|
||||
/// create AppendRequest with new WAL and pass it to safekeeper.
|
||||
pub async fn append_logical_message(
|
||||
tli: &FullAccessTimeline,
|
||||
tli: &WalResidentTimeline,
|
||||
msg: &AppendLogicalMessage,
|
||||
) -> anyhow::Result<InsertedWAL> {
|
||||
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
|
||||
|
||||
@@ -28,6 +28,8 @@ pub mod safekeeper;
|
||||
pub mod send_wal;
|
||||
pub mod state;
|
||||
pub mod timeline;
|
||||
pub mod timeline_eviction;
|
||||
pub mod timeline_guard;
|
||||
pub mod timeline_manager;
|
||||
pub mod timelines_set;
|
||||
pub mod wal_backup;
|
||||
@@ -49,6 +51,8 @@ pub mod defaults {
|
||||
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
|
||||
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
|
||||
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
|
||||
pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
|
||||
pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -85,6 +89,10 @@ pub struct SafeKeeperConf {
|
||||
pub partial_backup_enabled: bool,
|
||||
pub partial_backup_timeout: Duration,
|
||||
pub disable_periodic_broker_push: bool,
|
||||
pub enable_offload: bool,
|
||||
pub delete_offloaded_wal: bool,
|
||||
pub control_file_save_interval: Duration,
|
||||
pub partial_backup_concurrency: usize,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -124,6 +132,10 @@ impl SafeKeeperConf {
|
||||
partial_backup_enabled: false,
|
||||
partial_backup_timeout: Duration::from_secs(0),
|
||||
disable_periodic_broker_push: false,
|
||||
enable_offload: false,
|
||||
delete_offloaded_wal: false,
|
||||
control_file_save_interval: Duration::from_secs(1),
|
||||
partial_backup_concurrency: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,15 +5,15 @@ use std::{
|
||||
time::{Instant, SystemTime},
|
||||
};
|
||||
|
||||
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
|
||||
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS};
|
||||
use anyhow::Result;
|
||||
use futures::Future;
|
||||
use metrics::{
|
||||
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
|
||||
proto::MetricFamily,
|
||||
register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
|
||||
register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
|
||||
IntGaugeVec,
|
||||
register_histogram_vec, register_int_counter, register_int_counter_pair,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter,
|
||||
IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
@@ -48,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"safekeeper_write_wal_seconds",
|
||||
"Seconds spent writing and syncing WAL to a disk in a single request",
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_seconds histogram")
|
||||
});
|
||||
@@ -56,7 +56,7 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"safekeeper_flush_wal_seconds",
|
||||
"Seconds spent syncing WAL to a disk",
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_flush_wal_seconds histogram")
|
||||
});
|
||||
@@ -64,10 +64,28 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"safekeeper_persist_control_file_seconds",
|
||||
"Seconds to persist and sync control file",
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
|
||||
});
|
||||
pub static WAL_STORAGE_OPERATION_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"safekeeper_wal_storage_operation_seconds",
|
||||
"Seconds spent on WAL storage operations",
|
||||
&["operation"],
|
||||
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec")
|
||||
});
|
||||
pub static MISC_OPERATION_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"safekeeper_misc_operation_seconds",
|
||||
"Seconds spent on miscellaneous operations",
|
||||
&["operation"],
|
||||
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_misc_operation_seconds histogram vec")
|
||||
});
|
||||
pub static PG_IO_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"safekeeper_pg_io_bytes_total",
|
||||
@@ -126,7 +144,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"safekeeper_broker_push_update_seconds",
|
||||
"Seconds to push all timeline updates to the broker",
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_broker_push_update_seconds histogram vec")
|
||||
});
|
||||
|
||||
@@ -32,7 +32,7 @@ use crate::{
|
||||
routes::TimelineStatus,
|
||||
},
|
||||
safekeeper::Term,
|
||||
timeline::{get_tenant_dir, get_timeline_dir, FullAccessTimeline, Timeline, TimelineError},
|
||||
timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline},
|
||||
wal_storage::{self, open_wal_file, Storage},
|
||||
GlobalTimelines, SafeKeeperConf,
|
||||
};
|
||||
@@ -46,7 +46,7 @@ use utils::{
|
||||
|
||||
/// Stream tar archive of timeline to tx.
|
||||
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn stream_snapshot(tli: FullAccessTimeline, tx: mpsc::Sender<Result<Bytes>>) {
|
||||
pub async fn stream_snapshot(tli: WalResidentTimeline, tx: mpsc::Sender<Result<Bytes>>) {
|
||||
if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await {
|
||||
// Error type/contents don't matter as they won't can't reach the client
|
||||
// (hyper likely doesn't do anything with it), but http stream will be
|
||||
@@ -66,7 +66,7 @@ pub struct SnapshotContext {
|
||||
pub flush_lsn: Lsn,
|
||||
pub wal_seg_size: usize,
|
||||
// used to remove WAL hold off in Drop.
|
||||
pub tli: FullAccessTimeline,
|
||||
pub tli: WalResidentTimeline,
|
||||
}
|
||||
|
||||
impl Drop for SnapshotContext {
|
||||
@@ -80,7 +80,7 @@ impl Drop for SnapshotContext {
|
||||
}
|
||||
|
||||
pub async fn stream_snapshot_guts(
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
) -> Result<()> {
|
||||
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
|
||||
@@ -135,7 +135,7 @@ pub async fn stream_snapshot_guts(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl FullAccessTimeline {
|
||||
impl WalResidentTimeline {
|
||||
/// Start streaming tar archive with timeline:
|
||||
/// 1) stream control file under lock;
|
||||
/// 2) hold off WAL removal;
|
||||
@@ -160,6 +160,7 @@ impl FullAccessTimeline {
|
||||
ar: &mut tokio_tar::Builder<W>,
|
||||
) -> Result<SnapshotContext> {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
let wal_seg_size = shared_state.get_wal_seg_size();
|
||||
|
||||
let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME);
|
||||
let mut cf = File::open(cf_path).await?;
|
||||
@@ -173,19 +174,19 @@ impl FullAccessTimeline {
|
||||
// lock and setting `wal_removal_on_hold` later, it guarantees that WAL
|
||||
// won't be removed until we're done.
|
||||
let from_lsn = min(
|
||||
shared_state.sk.state.remote_consistent_lsn,
|
||||
shared_state.sk.state.backup_lsn,
|
||||
shared_state.sk.state().remote_consistent_lsn,
|
||||
shared_state.sk.state().backup_lsn,
|
||||
);
|
||||
if from_lsn == Lsn::INVALID {
|
||||
// this is possible if snapshot is called before handling first
|
||||
// elected message
|
||||
bail!("snapshot is called on uninitialized timeline");
|
||||
}
|
||||
let from_segno = from_lsn.segment_number(shared_state.get_wal_seg_size());
|
||||
let term = shared_state.sk.get_term();
|
||||
let last_log_term = shared_state.sk.get_last_log_term();
|
||||
let from_segno = from_lsn.segment_number(wal_seg_size);
|
||||
let term = shared_state.sk.state().acceptor_state.term;
|
||||
let last_log_term = shared_state.sk.last_log_term();
|
||||
let flush_lsn = shared_state.sk.flush_lsn();
|
||||
let upto_segno = flush_lsn.segment_number(shared_state.get_wal_seg_size());
|
||||
let upto_segno = flush_lsn.segment_number(wal_seg_size);
|
||||
// have some limit on max number of segments as a sanity check
|
||||
const MAX_ALLOWED_SEGS: u64 = 1000;
|
||||
let num_segs = upto_segno - from_segno + 1;
|
||||
@@ -206,14 +207,18 @@ impl FullAccessTimeline {
|
||||
}
|
||||
shared_state.wal_removal_on_hold = true;
|
||||
|
||||
// Drop shared_state to release the lock, before calling wal_residence_guard().
|
||||
drop(shared_state);
|
||||
|
||||
let tli_copy = self.wal_residence_guard().await?;
|
||||
let bctx = SnapshotContext {
|
||||
from_segno,
|
||||
upto_segno,
|
||||
term,
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
wal_seg_size: shared_state.get_wal_seg_size(),
|
||||
tli: self.clone(),
|
||||
wal_seg_size,
|
||||
tli: tli_copy,
|
||||
};
|
||||
|
||||
Ok(bctx)
|
||||
@@ -225,8 +230,8 @@ impl FullAccessTimeline {
|
||||
/// forget this if snapshotting fails mid the way.
|
||||
pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> {
|
||||
let shared_state = self.read_shared_state().await;
|
||||
let term = shared_state.sk.get_term();
|
||||
let last_log_term = shared_state.sk.get_last_log_term();
|
||||
let term = shared_state.sk.state().acceptor_state.term;
|
||||
let last_log_term = shared_state.sk.last_log_term();
|
||||
// There are some cases to relax this check (e.g. last_log_term might
|
||||
// change, but as long as older history is strictly part of new that's
|
||||
// fine), but there is no need to do it.
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::safekeeper::AcceptorProposerMessage;
|
||||
use crate::safekeeper::ProposerAcceptorMessage;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{anyhow, Context};
|
||||
@@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
let mut tli: Option<FullAccessTimeline> = None;
|
||||
let mut tli: Option<WalResidentTimeline> = None;
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
let handle_end_fut = pgb.handle_copy_stream_end(end);
|
||||
@@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler {
|
||||
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tli: &mut Option<FullAccessTimeline>,
|
||||
tli: &mut Option<WalResidentTimeline>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// Notify the libpq client that it's allowed to send `CopyData` messages
|
||||
pgb.write_message(&BeMessage::CopyBothResponse).await?;
|
||||
@@ -269,11 +269,11 @@ impl SafekeeperPostgresHandler {
|
||||
.get_walreceivers()
|
||||
.pageserver_feedback_tx
|
||||
.subscribe();
|
||||
*tli = Some(timeline.clone());
|
||||
*tli = Some(timeline.wal_residence_guard().await?);
|
||||
|
||||
tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r,
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
|
||||
}
|
||||
} else {
|
||||
@@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> {
|
||||
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
async fn read_first_message(
|
||||
&mut self,
|
||||
) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let tli = match next_msg {
|
||||
@@ -340,7 +340,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
let tli =
|
||||
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await?;
|
||||
tli.full_access_guard().await?
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
_ => {
|
||||
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
|
||||
@@ -356,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
next_msg: ProposerAcceptorMessage,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
*self.acceptor_handle = Some(WalAcceptor::spawn(
|
||||
@@ -451,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
/// replies to reply_tx; reading from socket and writing to disk in parallel is
|
||||
/// beneficial for performance, this struct provides writing to disk part.
|
||||
pub struct WalAcceptor {
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
@@ -464,7 +464,7 @@ impl WalAcceptor {
|
||||
///
|
||||
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
|
||||
pub fn spawn(
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
|
||||
@@ -21,7 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
|
||||
|
||||
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
|
||||
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
|
||||
use crate::timeline::FullAccessTimeline;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::{
|
||||
http::routes::TimelineStatus,
|
||||
receive_wal::MSG_QUEUE_SIZE,
|
||||
@@ -36,7 +36,7 @@ use crate::{
|
||||
/// Entrypoint for per timeline task which always runs, checking whether
|
||||
/// recovery for this safekeeper is needed and starting it if so.
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
|
||||
let cancel = tli.cancel.clone();
|
||||
@@ -66,12 +66,12 @@ pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
|
||||
/// Thus we don't try to predict it here.
|
||||
async fn recovery_needed(
|
||||
tli: &FullAccessTimeline,
|
||||
tli: &WalResidentTimeline,
|
||||
heartbeat_timeout: Duration,
|
||||
) -> RecoveryNeededInfo {
|
||||
let ss = tli.read_shared_state().await;
|
||||
let term = ss.sk.state.acceptor_state.term;
|
||||
let last_log_term = ss.sk.get_last_log_term();
|
||||
let term = ss.sk.state().acceptor_state.term;
|
||||
let last_log_term = ss.sk.last_log_term();
|
||||
let flush_lsn = ss.sk.flush_lsn();
|
||||
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
|
||||
let mut peers = ss.get_peers(heartbeat_timeout);
|
||||
@@ -195,7 +195,7 @@ impl From<&PeerInfo> for Donor {
|
||||
const CHECK_INTERVAL_MS: u64 = 2000;
|
||||
|
||||
/// Check regularly whether we need to start recovery.
|
||||
async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
async fn recovery_main_loop(tli: WalResidentTimeline, conf: SafeKeeperConf) {
|
||||
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
|
||||
loop {
|
||||
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
|
||||
@@ -205,7 +205,12 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
"starting recovery from donor {}: {}",
|
||||
donor.sk_id, recovery_needed_info
|
||||
);
|
||||
match recover(tli.clone(), donor, &conf).await {
|
||||
let res = tli.wal_residence_guard().await;
|
||||
if let Err(e) = res {
|
||||
warn!("failed to obtain guard: {}", e);
|
||||
continue;
|
||||
}
|
||||
match recover(res.unwrap(), donor, &conf).await {
|
||||
// Note: 'write_wal rewrites WAL written before' error is
|
||||
// expected here and might happen if compute and recovery
|
||||
// concurrently write the same data. Eventually compute
|
||||
@@ -228,7 +233,7 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
/// Recover from the specified donor. Returns message explaining normal finish
|
||||
/// reason or error.
|
||||
async fn recover(
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
donor: &Donor,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
@@ -314,7 +319,7 @@ async fn recover(
|
||||
|
||||
// Pull WAL from donor, assuming handshake is already done.
|
||||
async fn recovery_stream(
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
donor: &Donor,
|
||||
start_streaming_at: Lsn,
|
||||
conf: &SafeKeeperConf,
|
||||
@@ -364,10 +369,10 @@ async fn recovery_stream(
|
||||
// As in normal walreceiver, do networking and writing to disk in parallel.
|
||||
let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
|
||||
let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
|
||||
let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
|
||||
let wa = WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, None);
|
||||
|
||||
let res = tokio::select! {
|
||||
r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
|
||||
r = network_io(physical_stream, msg_tx, donor.clone(), tli, conf.clone()) => r,
|
||||
r = read_replies(reply_rx, donor.term) => r.map(|()| None),
|
||||
};
|
||||
|
||||
@@ -398,7 +403,7 @@ async fn network_io(
|
||||
physical_stream: ReplicationStream,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
donor: Donor,
|
||||
tli: FullAccessTimeline,
|
||||
tli: WalResidentTimeline,
|
||||
conf: SafeKeeperConf,
|
||||
) -> anyhow::Result<Option<String>> {
|
||||
let mut physical_stream = pin!(physical_stream);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user