mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-26 23:00:37 +00:00
Compare commits
7 Commits
no_xid_ali
...
problame/m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e68c43c19b | ||
|
|
b2830a48dd | ||
|
|
8b482a8be0 | ||
|
|
81f13e17ce | ||
|
|
ba13f2a90a | ||
|
|
630c8a9b86 | ||
|
|
f12f31ae77 |
@@ -183,7 +183,7 @@ runs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
|
||||
2
.github/actions/download/action.yml
vendored
2
.github/actions/download/action.yml
vendored
@@ -26,7 +26,7 @@ runs:
|
||||
TARGET: ${{ inputs.path }}
|
||||
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
|
||||
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
|
||||
run: |
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
17
.github/actions/run-python-test-set/action.yml
vendored
17
.github/actions/run-python-test-set/action.yml
vendored
@@ -56,14 +56,14 @@ runs:
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Download Neon binaries for the previous release
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon-previous
|
||||
prefix: latest
|
||||
|
||||
@@ -89,7 +89,7 @@ runs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
@@ -114,7 +114,6 @@ runs:
|
||||
export PLATFORM=${PLATFORM:-github-actions-selfhosted}
|
||||
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
|
||||
export DEFAULT_PG_VERSION=${PG_VERSION#v}
|
||||
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
|
||||
|
||||
if [ "${BUILD_TYPE}" = "remote" ]; then
|
||||
export REMOTE_ENV=1
|
||||
@@ -179,15 +178,7 @@ runs:
|
||||
|
||||
# Wake up the cluster if we use remote neon instance
|
||||
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
|
||||
done
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
|
||||
fi
|
||||
|
||||
# Run the tests.
|
||||
|
||||
4
.github/actions/upload/action.yml
vendored
4
.github/actions/upload/action.yml
vendored
@@ -8,7 +8,7 @@ inputs:
|
||||
description: "A directory or file to upload"
|
||||
required: true
|
||||
prefix:
|
||||
description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
required: false
|
||||
|
||||
runs:
|
||||
@@ -45,7 +45,7 @@ runs:
|
||||
env:
|
||||
SOURCE: ${{ inputs.path }}
|
||||
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
|
||||
run: |
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
91
.github/workflows/benchmarking.yml
vendored
91
.github/workflows/benchmarking.yml
vendored
@@ -77,7 +77,7 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -235,10 +235,15 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
@@ -277,6 +282,16 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -358,17 +373,30 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Benchmark pgvector hnsw indexing
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -389,12 +417,12 @@ jobs:
|
||||
test_selection: performance/test_perf_pgvector_queries.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
@@ -445,10 +473,15 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
@@ -470,6 +503,16 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -533,10 +576,15 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
@@ -565,6 +613,16 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -619,10 +677,15 @@ jobs:
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
@@ -644,6 +707,16 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
|
||||
11
.github/workflows/build-build-tools-image.yml
vendored
11
.github/workflows/build-build-tools-image.yml
vendored
@@ -63,16 +63,14 @@ jobs:
|
||||
mkdir -p /tmp/.docker-custom
|
||||
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
cache-binary: false
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/build-push-action@v6
|
||||
- uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
provenance: false
|
||||
@@ -80,11 +78,10 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.build-tools
|
||||
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
|
||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf /tmp/.docker-custom
|
||||
|
||||
|
||||
83
.github/workflows/build_and_test.yml
vendored
83
.github/workflows/build_and_test.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
@@ -109,7 +109,7 @@ jobs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
# !~/.cargo/registry/src
|
||||
# ~/.cargo/git/
|
||||
# target/
|
||||
# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
@@ -291,29 +291,29 @@ jobs:
|
||||
# target/
|
||||
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
# key: |
|
||||
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
@@ -335,8 +335,6 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
@@ -385,11 +383,6 @@ jobs:
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
#nextest does not yet support running doctests
|
||||
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
@@ -418,7 +411,7 @@ jobs:
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
@@ -497,7 +490,7 @@ jobs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
@@ -646,7 +639,7 @@ jobs:
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Get coverage artifact
|
||||
@@ -751,16 +744,14 @@ jobs:
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
cache-binary: false
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/build-push-action@v6
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
@@ -772,7 +763,7 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -831,12 +822,11 @@ jobs:
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
with:
|
||||
cache-binary: false
|
||||
# Disable parallelism for docker buildkit.
|
||||
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
|
||||
buildkitd-config-inline: |
|
||||
config-inline: |
|
||||
[worker.oci]
|
||||
max-parallelism = 1
|
||||
|
||||
@@ -852,7 +842,7 @@ jobs:
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Build compute-node image
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
@@ -865,13 +855,13 @@ jobs:
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build neon extensions test image
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
@@ -885,14 +875,14 @@ jobs:
|
||||
file: Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
target: compute-tools-image
|
||||
context: .
|
||||
@@ -1255,7 +1245,6 @@ jobs:
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
@@ -1350,7 +1339,7 @@ jobs:
|
||||
# Update Neon artifact for the release (reuse already uploaded artifact)
|
||||
for build_type in debug release; do
|
||||
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
|
||||
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
|
||||
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
@@ -1368,31 +1357,3 @@ jobs:
|
||||
with:
|
||||
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
# This job simplifies setting branch protection rules (in GitHub UI)
|
||||
# by allowing to set only this job instead of listing many others.
|
||||
# It also makes it easier to rename or parametrise jobs (using matrix)
|
||||
# which requires changes in branch protection rules
|
||||
#
|
||||
# Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that.
|
||||
#
|
||||
# https://github.com/neondatabase/neon/settings/branch_protection_rules
|
||||
conclusion:
|
||||
if: always()
|
||||
# Format `needs` differently to make the list more readable.
|
||||
# Usually we do `needs: [...]`
|
||||
needs:
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
- regress-tests
|
||||
- test-images
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
# The list of possible results:
|
||||
# https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
|
||||
- name: Fail the job if any of the dependencies do not succeed
|
||||
run: exit 1
|
||||
if: |
|
||||
contains(needs.*.result, 'failure')
|
||||
|| contains(needs.*.result, 'cancelled')
|
||||
|| contains(needs.*.result, 'skipped')
|
||||
|
||||
9
.github/workflows/neon_extra_builds.yml
vendored
9
.github/workflows/neon_extra_builds.yml
vendored
@@ -232,19 +232,12 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
|
||||
|
||||
- name: Run cargo test
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
cargo nextest run $CARGO_FEATURES -j$(nproc)
|
||||
|
||||
# Run separate tests for real S3
|
||||
@@ -385,7 +378,7 @@ jobs:
|
||||
run: make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)
|
||||
run: cargo build --all --release --timings -j$(nproc)
|
||||
|
||||
- name: Upload the build stats
|
||||
id: upload-stats
|
||||
|
||||
155
.github/workflows/periodic_pagebench.yml
vendored
155
.github/workflows/periodic_pagebench.yml
vendored
@@ -1,155 +0,0 @@
|
||||
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
|
||||
workflow_dispatch: # Allows manual triggering of the workflow
|
||||
inputs:
|
||||
commit_hash:
|
||||
type: string
|
||||
description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
trigger_bench_on_ec2_machine_in_eu_central_1:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
timeout-minutes: 360 # Set the timeout to 6 hours
|
||||
env:
|
||||
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
|
||||
RUN_ID: ${{ github.run_id }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
|
||||
AWS_DEFAULT_REGION : "eu-central-1"
|
||||
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
|
||||
steps:
|
||||
# we don't need the neon source code because we run everything remotely
|
||||
# however we still need the local github actions to run the allure step below
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Show my own (github runner) external IP address - usefull for IP allowlisting
|
||||
run: curl https://ifconfig.me
|
||||
|
||||
- name: Start EC2 instance and wait for the instance to boot up
|
||||
run: |
|
||||
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
|
||||
aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
|
||||
sleep 60 # sleep some time to allow cloudinit and our API server to start up
|
||||
|
||||
- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
|
||||
run: |
|
||||
public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
|
||||
echo "Public IP of the EC2 instance: $public_ip"
|
||||
echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
|
||||
|
||||
- name: Determine commit hash
|
||||
env:
|
||||
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
|
||||
run: |
|
||||
if [ -z "$INPUT_COMMIT_HASH" ]; then
|
||||
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
|
||||
else
|
||||
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Start Bench with run_id
|
||||
run: |
|
||||
curl -k -X 'POST' \
|
||||
"${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
|
||||
|
||||
- name: Poll Test Status
|
||||
id: poll_step
|
||||
run: |
|
||||
status=""
|
||||
while [[ "$status" != "failure" && "$status" != "success" ]]; do
|
||||
response=$(curl -k -X 'GET' \
|
||||
"${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY")
|
||||
echo "Response: $response"
|
||||
set +x
|
||||
status=$(echo $response | jq -r '.status')
|
||||
echo "Test status: $status"
|
||||
if [[ "$status" == "failure" ]]; then
|
||||
echo "Test failed"
|
||||
exit 1 # Fail the job step if status is failure
|
||||
elif [[ "$status" == "success" || "$status" == "null" ]]; then
|
||||
break
|
||||
elif [[ "$status" == "too_many_runs" ]]; then
|
||||
echo "Too many runs already running"
|
||||
echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sleep 60 # Poll every 60 seconds
|
||||
done
|
||||
|
||||
- name: Retrieve Test Logs
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
curl -k -X 'GET' \
|
||||
"${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/gzip' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
--output "test_log_${GITHUB_RUN_ID}.gz"
|
||||
|
||||
- name: Unzip Test Log and Print it into this job's log
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
gzip -d "test_log_${GITHUB_RUN_ID}.gz"
|
||||
cat "test_log_${GITHUB_RUN_ID}"
|
||||
|
||||
- name: Create Allure report
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
- name: Cleanup Test Resources
|
||||
if: always()
|
||||
run: |
|
||||
curl -k -X 'POST' \
|
||||
"${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d ''
|
||||
|
||||
- name: Stop EC2 instance and wait for the instance to be stopped
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
|
||||
aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
|
||||
115
.github/workflows/pg-clients.yml
vendored
115
.github/workflows/pg-clients.yml
vendored
@@ -1,115 +0,0 @@
|
||||
name: Test Postgres client libraries
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/pg-clients.yml'
|
||||
- 'test_runner/pg_clients/**'
|
||||
- 'poetry.lock'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: neon-captest-new
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
AWS_DEFAULT_REGION: eu-central-1
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
test-postgres-client-libs:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init --user root
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: remote
|
||||
test_selection: pg_clients
|
||||
run_in_parallel: false
|
||||
extra_params: -m remote_cluster
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: always()
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: github.event.schedule && failure()
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||
slack-message: |
|
||||
Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
98
.github/workflows/pg_clients.yml
vendored
Normal file
98
.github/workflows/pg_clients.yml
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
name: Test Postgres client libraries
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test-postgres-client-libs:
|
||||
# TODO: switch to gen2 runner, requires docker
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
REMOTE_ENV: 1
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Test framework expects we have psql binary;
|
||||
# but since we don't really need it in this test, let's mock it
|
||||
mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
|
||||
./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "remote_cluster" \
|
||||
-rA "test_runner/pg_clients"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
# We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
|
||||
# It will be fixed after switching to gen2 runner
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
retention-days: 7
|
||||
name: python-test-pg_clients-${{ runner.os }}-stage-logs
|
||||
path: ${{ env.TEST_OUTPUT }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
@@ -1,5 +1,4 @@
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
|
||||
# * `-D clippy::todo` - don't let `todo!()` slip into `main`
|
||||
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo"
|
||||
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
|
||||
|
||||
117
Cargo.lock
generated
117
Cargo.lock
generated
@@ -1014,9 +1014,6 @@ name = "camino"
|
||||
version = "1.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "camino-tempfile"
|
||||
@@ -1246,7 +1243,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit 0.19.10",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -1362,8 +1359,8 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml 0.7.4",
|
||||
"toml_edit 0.19.10",
|
||||
"toml",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
@@ -1669,9 +1666,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel"
|
||||
version = "2.2.1"
|
||||
version = "2.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
@@ -1684,12 +1681,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel_derives"
|
||||
version = "2.2.1"
|
||||
version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59de76a222c2b8059f789cbe07afbfd8deb8c31dd0bc2a21f85e256c1def8259"
|
||||
checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
|
||||
dependencies = [
|
||||
"diesel_table_macro_syntax",
|
||||
"dsl_auto_type",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
@@ -1697,9 +1693,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel_migrations"
|
||||
version = "2.2.0"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6"
|
||||
checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
|
||||
dependencies = [
|
||||
"diesel",
|
||||
"migrations_internals",
|
||||
@@ -1708,9 +1704,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "diesel_table_macro_syntax"
|
||||
version = "0.2.0"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25"
|
||||
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
|
||||
dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
@@ -1746,20 +1742,6 @@ dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dsl_auto_type"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dyn-clone"
|
||||
version = "1.0.14"
|
||||
@@ -3099,19 +3081,19 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "migrations_internals"
|
||||
version = "2.2.0"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
|
||||
checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"toml 0.8.14",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "migrations_macros"
|
||||
version = "2.2.0"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd"
|
||||
checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
|
||||
dependencies = [
|
||||
"migrations_internals",
|
||||
"proc-macro2",
|
||||
@@ -3591,7 +3573,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit 0.19.10",
|
||||
"toml_edit",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -3674,7 +3656,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit 0.19.10",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"twox-hash",
|
||||
"url",
|
||||
@@ -4020,7 +4002,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4033,7 +4015,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -4052,7 +4034,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4665,7 +4647,6 @@ dependencies = [
|
||||
"futures-util",
|
||||
"http-types",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"itertools",
|
||||
"metrics",
|
||||
@@ -4680,7 +4661,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit 0.19.10",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -5179,7 +5160,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit 0.19.10",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
@@ -5458,9 +5439,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "0.6.6"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
|
||||
checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -6225,7 +6206,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6345,26 +6326,14 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.19.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.22.14",
|
||||
"toml_edit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.6"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
|
||||
checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -6379,20 +6348,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow 0.4.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
|
||||
dependencies = [
|
||||
"indexmap 2.0.1",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow 0.6.13",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6811,7 +6767,6 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
@@ -7376,15 +7331,6 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.6.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winreg"
|
||||
version = "0.50.0"
|
||||
@@ -7421,7 +7367,6 @@ dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64ct",
|
||||
"bytes",
|
||||
"camino",
|
||||
"cc",
|
||||
"chrono",
|
||||
"clap",
|
||||
@@ -7474,7 +7419,7 @@ dependencies = [
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.19.10",
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
|
||||
@@ -42,13 +42,12 @@ ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||
COPY --chown=nonroot . .
|
||||
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
@@ -57,7 +56,6 @@ RUN set -e \
|
||||
--bin storage_controller \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--bin storage_scrubber \
|
||||
--locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
@@ -84,7 +82,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
|
||||
@@ -1,13 +1,5 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Use ARG as a build-time environment variable here to allow.
|
||||
# It's not supposed to be set outside.
|
||||
# Alternatively it can be obtained using the following command
|
||||
# ```
|
||||
# . /etc/os-release && echo "${VERSION_CODENAME}"
|
||||
# ```
|
||||
ARG DEBIAN_VERSION_CODENAME=bullseye
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
@@ -34,6 +26,7 @@ RUN set -e \
|
||||
liblzma-dev \
|
||||
libncurses5-dev \
|
||||
libncursesw5-dev \
|
||||
libpq-dev \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
@@ -74,24 +67,19 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=18
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Install docker
|
||||
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
|
||||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y docker-ce docker-ce-cli \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Configure sudo & docker
|
||||
RUN usermod -aG sudo nonroot && \
|
||||
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
|
||||
usermod -aG docker nonroot
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
@@ -125,10 +113,10 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Compile and install the static OpenSSL library
|
||||
ENV OPENSSL_VERSION=1.1.1w
|
||||
ENV OPENSSL_VERSION=3.2.2
|
||||
ENV OPENSSL_PREFIX=/usr/local/openssl
|
||||
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
cd /tmp && \
|
||||
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
|
||||
@@ -467,6 +467,31 @@ RUN case "${PG_VERSION}" in \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "kq-imcx-pg-build"
|
||||
# compile kq_imcx extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS kq-imcx-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -815,6 +840,7 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -935,6 +961,7 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hintplan.patch /ext-src
|
||||
#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
|
||||
@@ -873,8 +873,9 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Wrapped this around `pg_ctl reload`, but right now we don't use
|
||||
// `pg_ctl` for start / stop.
|
||||
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
|
||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
||||
// have opened connection to Postgres and superuser access.
|
||||
#[instrument(skip_all)]
|
||||
fn pg_reload_conf(&self) -> Result<()> {
|
||||
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
||||
|
||||
@@ -83,6 +83,12 @@ pub fn write_postgres_conf(
|
||||
ComputeMode::Replica => {
|
||||
// hot_standby is 'on' by default, but let's be explicit
|
||||
writeln!(file, "hot_standby=on")?;
|
||||
|
||||
// Inform the replica about the primary state
|
||||
// Default is 'false'
|
||||
if let Some(primary_is_running) = spec.primary_is_running {
|
||||
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,11 +17,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
// should be handled gracefully.
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let mut connstr = compute.connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "compute_activity_monitor");
|
||||
let connstr = connstr.as_str();
|
||||
let connstr = compute.connstr.as_str();
|
||||
|
||||
// During startup and configuration we connect to every Postgres database,
|
||||
// but we don't want to count this as some user activity. So wait until
|
||||
|
||||
@@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
|
||||
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
|
||||
/// - next line starts with timestamp
|
||||
/// - EOF
|
||||
/// - no new lines were written for the last 100 milliseconds
|
||||
/// - no new lines were written for the last second
|
||||
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
|
||||
let mut lines = tokio::io::BufReader::new(stderr).lines();
|
||||
let timeout_duration = Duration::from_millis(100);
|
||||
|
||||
@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
|
||||
// it's waiting. If the process hasn't started/stopped after 5 seconds,
|
||||
// it prints a notice that it's taking long, but keeps waiting.
|
||||
//
|
||||
const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
|
||||
const RETRY_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const DOT_EVERY_RETRIES: u128 = 10;
|
||||
const NOTICE_AFTER_RETRIES: u128 = 50;
|
||||
const RETRY_UNTIL_SECS: u64 = 10;
|
||||
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
|
||||
const RETRY_INTERVAL_MILLIS: u64 = 100;
|
||||
const DOT_EVERY_RETRIES: u64 = 10;
|
||||
const NOTICE_AFTER_RETRIES: u64 = 50;
|
||||
|
||||
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
|
||||
/// it itself.
|
||||
@@ -52,7 +52,6 @@ pub enum InitialPidFile {
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
@@ -60,7 +59,6 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
args: AI,
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
retry_timeout: &Duration,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
@@ -71,7 +69,6 @@ where
|
||||
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
EI: IntoIterator<Item = (String, String)>,
|
||||
{
|
||||
let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
|
||||
if !datadir.metadata().context("stat datadir")?.is_dir() {
|
||||
anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
|
||||
}
|
||||
@@ -133,7 +130,7 @@ where
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..retries {
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
@@ -151,7 +148,7 @@ where
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(RETRY_INTERVAL);
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
@@ -160,10 +157,9 @@ where
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(format!(
|
||||
"{} did not start+pass status checks within {:?} seconds",
|
||||
process_name, retry_timeout
|
||||
));
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
@@ -219,7 +215,7 @@ pub fn stop_process(
|
||||
}
|
||||
|
||||
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
for retries in 0..STOP_RETRIES {
|
||||
for retries in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
@@ -235,7 +231,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(RETRY_INTERVAL);
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} with pid {pid} failed to stop: {e:#}");
|
||||
@@ -244,10 +240,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(format!(
|
||||
"{} with pid {} did not stop in {:?} seconds",
|
||||
process_name, pid, STOP_RETRY_TIMEOUT
|
||||
));
|
||||
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
|
||||
@@ -21,8 +21,10 @@ use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest};
|
||||
use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
@@ -34,7 +36,6 @@ use std::collections::{BTreeSet, HashMap};
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use url::Host;
|
||||
use utils::{
|
||||
@@ -98,7 +99,7 @@ fn main() -> Result<()> {
|
||||
let subcommand_result = match sub_name {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
|
||||
"start" => rt.block_on(handle_start_all(&env)),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||
@@ -598,9 +599,13 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
Some(("import", import_match)) => {
|
||||
let tenant_id = get_tenant_id(import_match, env)?;
|
||||
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
|
||||
let branch_name = import_match
|
||||
.get_one::<String>("branch-name")
|
||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||
let name = import_match
|
||||
.get_one::<String>("node-name")
|
||||
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||
let update_catalog = import_match
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
// Parse base inputs
|
||||
let base_tarfile = import_match
|
||||
@@ -627,11 +632,24 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
println!("Importing timeline into pageserver ...");
|
||||
pageserver
|
||||
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
|
||||
.await?;
|
||||
env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?;
|
||||
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
|
||||
|
||||
println!("Creating endpoint for imported timeline ...");
|
||||
cplane.new_endpoint(
|
||||
name,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
None,
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
!update_catalog,
|
||||
)?;
|
||||
println!("Done");
|
||||
}
|
||||
Some(("branch", branch_match)) => {
|
||||
@@ -846,13 +864,20 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? {
|
||||
safekeepers
|
||||
} else {
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
let mut safekeepers: Vec<NodeId> = Vec::new();
|
||||
for sk_id in safekeepers_str.split(',').map(str::trim) {
|
||||
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
|
||||
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
|
||||
})?);
|
||||
safekeepers.push(sk_id);
|
||||
}
|
||||
safekeepers
|
||||
} else {
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
@@ -956,10 +981,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = parse_safekeepers(sub_args)?;
|
||||
endpoint.reconfigure(pageservers, None, safekeepers).await?;
|
||||
endpoint.reconfigure(pageservers, None).await?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -981,23 +1003,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parse --safekeepers as list of safekeeper ids.
|
||||
fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> {
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
let mut safekeepers: Vec<NodeId> = Vec::new();
|
||||
for sk_id in safekeepers_str.split(',').map(str::trim) {
|
||||
let sk_id = NodeId(
|
||||
u64::from_str(sk_id)
|
||||
.map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?,
|
||||
);
|
||||
safekeepers.push(sk_id);
|
||||
}
|
||||
Ok(Some(safekeepers))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(ep_subcommand_data) => ep_subcommand_data,
|
||||
@@ -1043,20 +1048,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
))
|
||||
}
|
||||
|
||||
fn get_start_timeout(args: &ArgMatches) -> &Duration {
|
||||
let humantime_duration = args
|
||||
.get_one::<humantime::Duration>("start-timeout")
|
||||
.expect("invalid value for start-timeout");
|
||||
humantime_duration.as_ref()
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(get_start_timeout(subcommand_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1082,7 +1077,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
|
||||
if let Err(e) = pageserver.start().await {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1110,8 +1105,8 @@ async fn handle_storage_controller(
|
||||
) -> Result<()> {
|
||||
let svc = StorageController::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", start_match)) => {
|
||||
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start().await {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1170,10 +1165,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
"start" => {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
|
||||
if let Err(e) = safekeeper
|
||||
.start(extra_opts, get_start_timeout(sub_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1199,10 +1191,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
if let Err(e) = safekeeper
|
||||
.start(extra_opts, get_start_timeout(sub_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1215,18 +1204,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_start_all(
|
||||
env: &local_env::LocalEnv,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
// Endpoints are not started automatically
|
||||
|
||||
broker::start_broker_process(env, retry_timeout).await?;
|
||||
broker::start_broker_process(env).await?;
|
||||
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.start(retry_timeout).await {
|
||||
if let Err(e) = storage_controller.start().await {
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1235,7 +1221,7 @@ async fn handle_start_all(
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver.start(retry_timeout).await {
|
||||
if let Err(e) = pageserver.start().await {
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1244,7 +1230,7 @@ async fn handle_start_all(
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
|
||||
if let Err(e) = safekeeper.start(vec![]).await {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false).await;
|
||||
exit(1);
|
||||
@@ -1304,15 +1290,6 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
let timeout_arg = Arg::new("start-timeout")
|
||||
.long("start-timeout")
|
||||
.short('t')
|
||||
.global(true)
|
||||
.help("timeout until we fail the command, e.g. 30s")
|
||||
.value_parser(value_parser!(humantime::Duration))
|
||||
.default_value("10s")
|
||||
.required(false);
|
||||
|
||||
let branch_name_arg = Arg::new("branch-name")
|
||||
.long("branch-name")
|
||||
.help("Name of the branch to be created or used as an alias for other services")
|
||||
@@ -1481,7 +1458,8 @@ fn cli() -> Command {
|
||||
.about("Import timeline from basebackup directory")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(Arg::new("node-name").long("node-name")
|
||||
.help("Name to assign to the imported timeline"))
|
||||
.arg(Arg::new("base-tarfile")
|
||||
.long("base-tarfile")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
@@ -1497,6 +1475,7 @@ fn cli() -> Command {
|
||||
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(update_catalog.clone())
|
||||
)
|
||||
).subcommand(
|
||||
Command::new("tenant")
|
||||
@@ -1530,7 +1509,6 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
@@ -1538,15 +1516,13 @@ fn cli() -> Command {
|
||||
)
|
||||
.subcommand(Command::new("restart")
|
||||
.about("Restart local pageserver")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("storage_controller")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start storage controller")
|
||||
.arg(timeout_arg.clone()))
|
||||
.subcommand(Command::new("start").about("Start storage controller"))
|
||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
)
|
||||
@@ -1558,7 +1534,6 @@ fn cli() -> Command {
|
||||
.about("Start local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg.clone())
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local safekeeper")
|
||||
@@ -1570,7 +1545,6 @@ fn cli() -> Command {
|
||||
.arg(safekeeper_id_arg)
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg)
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
@@ -1601,16 +1575,14 @@ fn cli() -> Command {
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(endpoint_pageserver_id_arg.clone())
|
||||
.arg(safekeepers_arg.clone())
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
.arg(allow_multiple.clone())
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
.arg(endpoint_pageserver_id_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
)
|
||||
@@ -1658,7 +1630,6 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("start")
|
||||
.about("Start page server and safekeepers")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -5,18 +5,13 @@
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub async fn start_broker_process(
|
||||
env: &local_env::LocalEnv,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let broker = &env.broker;
|
||||
let listen_addr = &broker.listen_addr;
|
||||
|
||||
@@ -32,7 +27,6 @@ pub async fn start_broker_process(
|
||||
args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
let url = broker.client_url();
|
||||
let status_url = url.join("status").with_context(|| {
|
||||
|
||||
@@ -499,23 +499,6 @@ impl Endpoint {
|
||||
.join(",")
|
||||
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in sk_ids {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
Ok(safekeeper_connstrings)
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
@@ -540,7 +523,18 @@ impl Endpoint {
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in safekeepers {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
// if it is present, read it and pass to compute_ctl
|
||||
@@ -598,6 +592,7 @@ impl Endpoint {
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
primary_is_running: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -746,7 +741,6 @@ impl Endpoint {
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
@@ -781,12 +775,6 @@ impl Endpoint {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
if let Some(safekeepers) = safekeepers {
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
spec.safekeeper_connstrings = safekeeper_connstrings;
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
|
||||
@@ -325,16 +325,11 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
|
||||
}
|
||||
|
||||
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||
self.pg_dir(pg_version, "bin")
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
|
||||
}
|
||||
|
||||
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||
self.pg_dir(pg_version, "lib")
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> PathBuf {
|
||||
|
||||
@@ -17,7 +17,8 @@ use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
|
||||
TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -157,8 +158,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
self.start_node(retry_timeout).await
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
self.start_node().await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
|
||||
@@ -213,15 +214,14 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
async fn start_node(&self) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
|
||||
"Starting pageserver node {} at '{}' in {:?}",
|
||||
self.conf.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir,
|
||||
retry_timeout
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush().context("flush stdout")?;
|
||||
|
||||
@@ -239,7 +239,6 @@ impl PageServerNode {
|
||||
args,
|
||||
self.pageserver_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
let st = self.check_status().await;
|
||||
match st {
|
||||
@@ -396,6 +395,28 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let config = Self::parse_config(settings.clone())?;
|
||||
|
||||
let request = models::TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
|
||||
generation,
|
||||
config,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
// Placement policy is not meaningful for creations not done via storage controller
|
||||
placement_policy: None,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
Ok(self.http_client.tenant_create(&request).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -112,16 +111,11 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
extra_opts: Vec<String>,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display(),
|
||||
retry_timeout,
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
@@ -206,7 +200,6 @@ impl SafekeeperNode {
|
||||
&args,
|
||||
self.safekeeper_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
match self.check_status().await {
|
||||
Ok(()) => Ok(true),
|
||||
|
||||
@@ -5,11 +5,12 @@ use crate::{
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -17,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, str::FromStr, time::Duration};
|
||||
use std::{fs, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -155,16 +156,16 @@ impl StorageController {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
if tokio::fs::try_exists(&path).await? {
|
||||
return Ok(path);
|
||||
}
|
||||
@@ -172,20 +173,11 @@ impl StorageController {
|
||||
|
||||
// Fall through
|
||||
anyhow::bail!(
|
||||
"Postgres directory '{}' not found in {}",
|
||||
dir_name,
|
||||
self.env.pg_distrib_dir.display(),
|
||||
"Postgres binaries not found in {}",
|
||||
self.env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
self.get_pg_dir("bin").await
|
||||
}
|
||||
|
||||
pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
self.get_pg_dir("lib").await
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
@@ -232,23 +224,18 @@ impl StorageController {
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
@@ -283,12 +270,8 @@ impl StorageController {
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
retry_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
@@ -341,12 +324,8 @@ impl StorageController {
|
||||
&self.env.base_data_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use futures::StreamExt;
|
||||
use std::{str::FromStr, time::Duration};
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
|
||||
TenantShardSplitResponse,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -21,7 +21,7 @@ use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
@@ -110,6 +110,12 @@ enum Command {
|
||||
#[arg(long)]
|
||||
config: String,
|
||||
},
|
||||
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
|
||||
/// alternative to the storage controller's scheduling optimization behavior.
|
||||
TenantScatter {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Print details about a particular tenant, including all its shards' states.
|
||||
TenantDescribe {
|
||||
#[arg(long)]
|
||||
@@ -336,18 +342,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Command::TenantCreate { tenant_id } => {
|
||||
storcon_client
|
||||
.dispatch(
|
||||
Method::POST,
|
||||
"v1/tenant".to_string(),
|
||||
Some(TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
}),
|
||||
)
|
||||
vps_client
|
||||
.tenant_create(&TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantDelete { tenant_id } => {
|
||||
@@ -496,6 +498,88 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantScatter { tenant_id } => {
|
||||
// Find the shards
|
||||
let locate_response = storcon_client
|
||||
.dispatch::<(), TenantLocateResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = locate_response.shards;
|
||||
|
||||
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
let shard_count = shards.len();
|
||||
for s in shards {
|
||||
let entry = node_to_shards.entry(s.node_id).or_default();
|
||||
entry.push(s.shard_id);
|
||||
}
|
||||
|
||||
// Load list of available nodes
|
||||
let nodes_resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for node in nodes_resp {
|
||||
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
||||
node_to_shards.entry(node.id).or_default();
|
||||
}
|
||||
}
|
||||
|
||||
let max_shard_per_node = shard_count / node_to_shards.len();
|
||||
|
||||
loop {
|
||||
let mut migrate_shard = None;
|
||||
for shards in node_to_shards.values_mut() {
|
||||
if shards.len() > max_shard_per_node {
|
||||
// Pick the emptiest
|
||||
migrate_shard = Some(shards.pop().unwrap());
|
||||
}
|
||||
}
|
||||
let Some(migrate_shard) = migrate_shard else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Pick the emptiest node to migrate to
|
||||
let mut destinations = node_to_shards
|
||||
.iter()
|
||||
.map(|(k, v)| (k, v.len()))
|
||||
.collect::<Vec<_>>();
|
||||
destinations.sort_by_key(|i| i.1);
|
||||
let (destination_node, destination_count) = *destinations.first().unwrap();
|
||||
if destination_count + 1 > max_shard_per_node {
|
||||
// Even the emptiest destination doesn't have space: we're done
|
||||
break;
|
||||
}
|
||||
let destination_node = *destination_node;
|
||||
|
||||
node_to_shards
|
||||
.get_mut(&destination_node)
|
||||
.unwrap()
|
||||
.push(migrate_shard);
|
||||
|
||||
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: migrate_shard,
|
||||
node_id: destination_node,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
||||
}
|
||||
|
||||
// Spread the shards across the nodes
|
||||
}
|
||||
Command::TenantDescribe { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
|
||||
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
@@ -23,10 +23,11 @@ echo "Page server is ready."
|
||||
echo "Create a tenant and timeline"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
-X PUT
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
|
||||
http://pageserver:9898/v1/tenant/
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
|
||||
@@ -1,345 +0,0 @@
|
||||
# Graceful Restarts of Storage Controller Managed Clusters
|
||||
|
||||
## Summary
|
||||
This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
|
||||
It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
|
||||
graceful cluster restarts.
|
||||
|
||||
## Motivation
|
||||
|
||||
Pageserver restarts cause read availablity downtime for tenants.
|
||||
|
||||
For example pageserver-3 @ us-east-1 was unavailable for a randomly
|
||||
picked tenant (which requested on-demand activation) for around 30 seconds
|
||||
during the restart at 2024-04-03 16:37 UTC.
|
||||
|
||||
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||
|
||||
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||
tenant density is much lower there. However, we are planning on eventually migrating all
|
||||
pageservers to storage controller management, so it makes sense to solve the issue proactively.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Pageserver re-deployments cause minimal downtime for tenants
|
||||
- The storage controller exposes HTTP API hooks for draining and filling tenant shards
|
||||
from a given pageserver. Said hooks can be used by an orchestrator proces or a human operator.
|
||||
- The storage controller exposes some HTTP API to cancel draining and filling background operations.
|
||||
- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
|
||||
as usual (with downtime).
|
||||
- Progress of draining/filling is visible through metrics
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Integration with the control plane
|
||||
- Graceful restarts for large non-HA tenants.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
- storage controller
|
||||
- deployment orchestrator (i.e. Ansible)
|
||||
- pageserver (indirectly)
|
||||
|
||||
## Terminology
|
||||
|
||||
** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver
|
||||
are distributed across the rest of the cluster.
|
||||
|
||||
** Filling ** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
|
||||
pageserver until the cluster reaches a resonable, quiescent distribution of tenant shards across pageservers.
|
||||
|
||||
** Node scheduling policies ** act as constraints to the scheduler. For instance, when a
|
||||
node is set in the `Paused` policy, no further shards will be scheduled on it.
|
||||
|
||||
** Node ** is a pageserver. Term is used interchangeably in this RFC.
|
||||
|
||||
** Deployment orchestrator ** is a generic term for whatever drives our deployments.
|
||||
Currently, it's an Ansible playbook.
|
||||
|
||||
## Background
|
||||
|
||||
### Storage Controller Basics (skip if already familiar)
|
||||
|
||||
Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
|
||||
|
||||
An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assigment via `PUT location_config` requests and will notify the compute via the configured hook.
|
||||
|
||||
### Background Optimizations
|
||||
|
||||
The storage controller performs scheduling optimizations in the background. It will
|
||||
migrate attachments to warm secondaries and replace secondaries in order to balance
|
||||
the cluster out.
|
||||
|
||||
### Reconciliations Concurrency Limiting
|
||||
|
||||
There's a hard limit on the number of reconciles that the storage controller
|
||||
can have in flight at any given time. To get an idea of scales, the limit is
|
||||
128 at the time of writing.
|
||||
|
||||
## Implementation
|
||||
|
||||
Note: this section focuses on the core functionality of the graceful restart process.
|
||||
It doesn't neccesarily describe the most efficient approach. Optimizations are described
|
||||
separately in a later section.
|
||||
|
||||
### Overall Flow
|
||||
|
||||
This section describes how to implement graceful restarts from the perspective
|
||||
of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
|
||||
The orchestrator shall implement the following epilogue and prologue steps for each
|
||||
pageserver restart:
|
||||
|
||||
#### Prologue
|
||||
|
||||
The orchestrator shall first fetch the pageserver node id from the control plane or
|
||||
the pageserver it aims to restart directly. Next, it issues an HTTP request
|
||||
to the storage controller in order to start the drain of said pageserver node.
|
||||
All error responses are retried with a short back-off. When a 202 (Accepted)
|
||||
HTTP code is returned, the drain has started. Now the orchestrator polls the
|
||||
node status endpoint exposed by the storage controller in order to await the
|
||||
end of the drain process. When the `policy` field of the node status response
|
||||
becomes `PauseForRestart`, the drain has completed and the orchestrator can
|
||||
proceed with restarting the pageserver.
|
||||
|
||||
The prologue is subject to an overall timeout. It will have a value in the ballpark
|
||||
of minutes. As storage controller managed pageservers become more loaded this timeout
|
||||
will likely have to increase.
|
||||
|
||||
#### Epilogue
|
||||
|
||||
After restarting the pageserver, the orchestrator issues an HTTP request
|
||||
to the storage controller to kick off the filling process. This API call
|
||||
may be retried for all error codes with a short backoff. This also serves
|
||||
as a synchronization primitive as the fill will be refused if the pageserver
|
||||
has not yet re-attached to the storage controller. When a 202(Accepted) HTTP
|
||||
code is returned, the fill has started. Now the orchestrator polls the node
|
||||
status endpoint exposed by the storage controller in order to await the end of
|
||||
the filling process. When the `policy` field of the node status response becomes
|
||||
`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.
|
||||
|
||||
Again, the epilogue is subject to an overall timeout. We can start off with
|
||||
using the same timeout as for the prologue, but can also consider relying on
|
||||
the storage controller's background optimizations with a shorter timeout.
|
||||
|
||||
In the case that the deployment orchestrator times out, it attempts to cancel
|
||||
the fill. This operation shall be retried with a short back-off. If it ultimately
|
||||
fails it will require manual intervention to set the nodes scheduling policy to
|
||||
`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
|
||||
but it constrains the scheduler as mentioned previously.
|
||||
|
||||
### Node Scheduling Policy State Machine
|
||||
|
||||
The state machine below encodes the behaviours discussed above and
|
||||
the various failover situations described in a later section.
|
||||
|
||||
Assuming no failures and/or timeouts the flow should be:
|
||||
`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`
|
||||
|
||||
```
|
||||
Operator requested drain
|
||||
+-----------------------------------------+
|
||||
| |
|
||||
+-------+-------+ +-------v-------+
|
||||
| | | |
|
||||
| Pause | +-----------> Draining +----------+
|
||||
| | | | | |
|
||||
+---------------+ | +-------+-------+ |
|
||||
| | |
|
||||
| | |
|
||||
Drain requested| | |
|
||||
| |Drain complete | Drain failed
|
||||
| | | Cancelled/PS reattach/Storcon restart
|
||||
| | |
|
||||
+-------+-------+ | |
|
||||
| | | |
|
||||
+-------------+ Active <-----------+------------------+
|
||||
| | | |
|
||||
Fill requested | +---^---^-------+ |
|
||||
| | | |
|
||||
| | | |
|
||||
| | | |
|
||||
| Fill completed| | |
|
||||
| | |PS reattach |
|
||||
| | |after restart |
|
||||
+-------v-------+ | | +-------v-------+
|
||||
| | | | | |
|
||||
| Filling +---------+ +-----------+PauseForRestart|
|
||||
| | | |
|
||||
+---------------+ +---------------+
|
||||
```
|
||||
|
||||
### Draining/Filling APIs
|
||||
|
||||
The storage controller API to trigger the draining of a given node is:
|
||||
`PUT /v1/control/node/:node_id/{drain,fill}`.
|
||||
|
||||
The following HTTP non-success return codes are used.
|
||||
All of them are safely retriable from the perspective of the storage controller.
|
||||
- 404: Requested node was not found
|
||||
- 503: Requested node is known to the storage controller, but unavailable
|
||||
- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining
|
||||
- 409: A {drain, fill} is already in progress. Only one such background operation
|
||||
is allowed per node.
|
||||
|
||||
When the drain is accepted and commenced a 202 HTTP code is returned.
|
||||
|
||||
Drains and fills shall be cancellable by the deployment orchestrator or a
|
||||
human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
|
||||
response is returned when the cancelation is successful. Errors are retriable.
|
||||
|
||||
### Drain Process
|
||||
|
||||
Before accpeting a drain request the following validations is applied:
|
||||
* Ensure that the node is known the storage controller
|
||||
* Ensure that the schedulling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
|
||||
* Ensure that another drain or fill is not already running on the node
|
||||
* Ensure that a drain is possible (i.e. check that there is at least one
|
||||
schedulable node to drain to)
|
||||
|
||||
After accepting the drain, the scheduling policy of the node is set to
|
||||
`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
|
||||
This disallows the optimizer from adding or removing shards from the node which
|
||||
is desirable to avoid them racing.
|
||||
|
||||
Next, a separate Tokio task is spawned to manage the draining. For each tenant
|
||||
shard attached to the node being drained, demote the node to a secondary and
|
||||
attempt to schedule the node away. Scheduling might fail due to unsatisfiable
|
||||
constraints, but that is fine. Draining is a best effort process since it might
|
||||
not always be possible to cut over all shards.
|
||||
|
||||
Importantly, this task manages the concurrency of issued reconciles in order to
|
||||
avoid drowning out the target pageservers and to allow other important reconciles
|
||||
to proceed.
|
||||
|
||||
Once the triggered reconciles have finished or timed out, set the node's scheduling
|
||||
policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
|
||||
|
||||
A note on non HA tenants: These tenants do not have secondaries, so by the description
|
||||
above, they would not be migrated. It makes sense to skip them (especially the large ones)
|
||||
since, depending on tenant size, this might be more disruptive than the restart since the
|
||||
pageserver we've moved to do will need to on-demand download the entire working set for the tenant.
|
||||
We can consider expanding to small non-HA tenants in the future.
|
||||
|
||||
### Fill Process
|
||||
|
||||
Before accpeting a fill request the following validations is applied:
|
||||
* Ensure that the node is known the storage controller
|
||||
* Ensure that the schedulling policy is `NodeSchedulingPolicy::Active`.
|
||||
This is the only acceptable policy for the fill starting state. When a node re-attaches,
|
||||
it set the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
|
||||
`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
|
||||
* Ensure that another drain or fill is not already running on the node
|
||||
|
||||
After accepting the drain, the scheduling policy of the node is set to
|
||||
`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
|
||||
This disallows the optimizer from adding or removing shards from the node which
|
||||
is desirable to avoid them racing.
|
||||
|
||||
Next, a separate Tokio task is spawned to manage the draining. For each tenant
|
||||
shard where the filled node is a secondary, promote the secondary. This is done
|
||||
until we run out of shards or the counts of attached shards become balanced across
|
||||
the cluster.
|
||||
|
||||
Like for draining, the concurrency of spawned reconciles is limited.
|
||||
|
||||
### Failure Modes & Handling
|
||||
|
||||
Failures are generally handled by transition back into the `Active`
|
||||
(neutral) state. This simplifies the implementation greatly at the
|
||||
cost of adding transitions to the state machine. For example, we
|
||||
could detect the `Draining` state upon restart and proceed with a drain,
|
||||
but how should the storage controller know that's what the orchestrator
|
||||
needs still?
|
||||
|
||||
#### Storage Controller Crash
|
||||
|
||||
When the storage controller starts up reset the node scheduling policy
|
||||
of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
|
||||
`Active`. The rationale is that when the storage controller restarts,
|
||||
we have lost context of what the deployment orchestrator wants. It also
|
||||
has the benefit of making things easier to reason about.
|
||||
|
||||
#### Pageserver Crash During Drain
|
||||
|
||||
The pageserver will attempt to re-attach during restart at which
|
||||
point the node scheduling policy will be set back to `Active`, thus
|
||||
reenabling the scheduler to use the node.
|
||||
|
||||
#### Non-drained Pageserver Crash During Drain
|
||||
|
||||
What should happen when a pageserver we are draining to crashes during the
|
||||
process. Two reasonable options are: cancel the drain and focus on the failover
|
||||
*or* do both, but prioritise failover. Since the number of concurrent reconciles
|
||||
produced by drains/fills are limited, we get the later behaviour for free.
|
||||
My suggestion is we take this approach, but the cancellation option is trivial
|
||||
to implement as well.
|
||||
|
||||
#### Pageserver Crash During Fill
|
||||
|
||||
The pageserver will attempt to re-attach during restart at which
|
||||
point the node scheduling policy will be set back to `Active`, thus
|
||||
reenabling the scheduler to use the node.
|
||||
|
||||
#### Pageserver Goes unavailable During Drain/Fill
|
||||
|
||||
The drain and fill jobs handle this by stopping early. When the pageserver
|
||||
is detected as online by storage controller heartbeats, reset its scheduling
|
||||
policy to `Active`. If a restart happens instead, see the pageserver crash
|
||||
failure mode.
|
||||
|
||||
#### Orchestrator Drain Times Out
|
||||
|
||||
Orchestrator will still proceed with the restart.
|
||||
When the pageserver re-attaches, the scheduling policy is set back to
|
||||
`Active`.
|
||||
|
||||
#### Orchestrator Fill Times Out
|
||||
|
||||
Orchestrator will attempt to cancel the fill operation. If that fails,
|
||||
the fill will continue until it quiesces and the node will be left
|
||||
in the `Filling` scheduling policy. This hinders the scheduler, but is
|
||||
otherwise harmless. A human operator can handle this by setting the scheduling
|
||||
policy to `Active`, or we can bake in a fill timeout into the storage controller.
|
||||
|
||||
## Optimizations
|
||||
|
||||
### Location Warmth
|
||||
|
||||
When cutting over to a secondary, the storage controller will wait for it to
|
||||
become "warm" (i.e. download enough of the tenants data). This means that some
|
||||
reconciliations can take significantly longer than others and hold up precious
|
||||
reconciliations units. As an optimization, the drain stage can only cut over
|
||||
tenants that are already "warm". Similarly, the fill stage can prioritise the
|
||||
"warmest" tenants in the fill.
|
||||
|
||||
Given that the number of tenants by the storage controller will be fairly low
|
||||
for the foreseable future, the first implementation could simply query the tenants
|
||||
for secondary status. This doesn't scale well with increasing tenant counts, so
|
||||
eventually we will need new pageserver API endpoints to report the sets of
|
||||
"warm" and "cold" nodes.
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
### Draining and Filling Purely as Scheduling Constraints
|
||||
|
||||
At its core, the storage controller is a big background loop that detects changes
|
||||
in the environment and reacts on them. One could express draining and filling
|
||||
of nodes purely in terms of constraining the scheduler (as opposed to having
|
||||
such background tasks).
|
||||
|
||||
While theoretically nice, I think that's harder to implement and more importantly operate and reason about.
|
||||
Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
|
||||
an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
|
||||
to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong
|
||||
to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.
|
||||
|
||||
It would also mean that reconciliations themselves have side effects that persist in the database
|
||||
(persist something to the databse when the drain is done), which I'm not conceptually fond of.
|
||||
|
||||
## Proof of Concept
|
||||
|
||||
This RFC is accompanied by a POC which implements nearly everything mentioned here
|
||||
apart from the optimizations and some of the failure handling:
|
||||
https://github.com/neondatabase/neon/pull/7682
|
||||
@@ -96,6 +96,12 @@ pub struct ComputeSpec {
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
// When we are starting a new replica in hot standby mode,
|
||||
// we need to know if the primary is running.
|
||||
// This is used to determine if replica should wait for
|
||||
// RUNNING_XACTS from primary or not.
|
||||
pub primary_is_running: Option<bool>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -103,10 +103,9 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register maxrss_kb int gauge")
|
||||
});
|
||||
|
||||
/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher,
|
||||
/// especially during many concurrent disk operations.
|
||||
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
|
||||
&[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];
|
||||
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
pub struct BuildInfo {
|
||||
pub revision: &'static str,
|
||||
|
||||
@@ -11,27 +11,6 @@ use crate::{
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
pub new_tenant_id: TenantShardId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -301,19 +280,4 @@ mod test {
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reject_unknown_field() {
|
||||
let id = TenantId::generate();
|
||||
let create_request = serde_json::json!({
|
||||
"new_tenant_id": id.to_string(),
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
});
|
||||
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18;
|
||||
/// See [`Key::to_i128`] for more information on the encoding.
|
||||
pub const METADATA_KEY_SIZE: usize = 16;
|
||||
|
||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key.
|
||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
|
||||
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
||||
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
||||
|
||||
@@ -160,9 +160,8 @@ impl Key {
|
||||
key
|
||||
}
|
||||
|
||||
/// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`,
|
||||
/// and therefore not all 18B slices are valid page server keys.
|
||||
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -174,7 +173,7 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently.
|
||||
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
|
||||
@@ -17,16 +17,6 @@ pub struct KeySpace {
|
||||
pub ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for KeySpace {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
for range in &self.ranges {
|
||||
write!(f, "{}..{},", range.start, range.end)?;
|
||||
}
|
||||
write!(f, "]")
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper type for sparse keyspaces.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub struct SparseKeySpace(pub KeySpace);
|
||||
|
||||
@@ -25,6 +25,7 @@ use utils::{
|
||||
serde_system_time,
|
||||
};
|
||||
|
||||
use crate::controller_api::PlacementPolicy;
|
||||
use crate::{
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
@@ -228,11 +229,6 @@ pub struct TimelineCreateRequest {
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LsnLeaseRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantShardSplitRequest {
|
||||
pub new_shard_count: u8,
|
||||
@@ -275,6 +271,44 @@ impl Default for ShardParameters {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
pub new_tenant_id: TenantShardId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
// This parameter is only meaningful in requests sent to the storage controller
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLoadRequest {
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantCreateRequest {
|
||||
type Target = TenantConfig;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.config
|
||||
}
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
/// simpler types.
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
@@ -437,37 +471,6 @@ pub enum CompactionAlgorithm {
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
PartialEq,
|
||||
Eq,
|
||||
Serialize,
|
||||
Deserialize,
|
||||
strum_macros::FromRepr,
|
||||
strum_macros::EnumString,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum ImageCompressionAlgorithm {
|
||||
/// Disabled for writes, and never decompress during reading.
|
||||
/// Never set this after you've enabled compression once!
|
||||
DisabledNoDecompress,
|
||||
// Disabled for writes, support decompressing during read path
|
||||
Disabled,
|
||||
/// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
|
||||
/// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
|
||||
Zstd {
|
||||
level: Option<i8>,
|
||||
},
|
||||
}
|
||||
|
||||
impl ImageCompressionAlgorithm {
|
||||
pub fn allow_decompression(&self) -> bool {
|
||||
!matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
@@ -560,6 +563,10 @@ pub struct LocationConfigListResponse {
|
||||
pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(pub TenantId);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
@@ -616,6 +623,31 @@ impl TenantConfigRequest {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
#[serde(default)]
|
||||
pub config: TenantAttachConfig,
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
/// its usage inside `TenantAttachRequest`.
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantAttachConfig {
|
||||
#[serde(flatten)]
|
||||
allowing_unknown_fields: TenantConfig,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantAttachConfig {
|
||||
type Target = TenantConfig;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.allowing_unknown_fields
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
@@ -634,7 +666,8 @@ pub struct TenantInfo {
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
pub generation: u32,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -679,16 +712,6 @@ pub struct TimelineInfo {
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_logical_size_non_incremental: Option<u64>,
|
||||
|
||||
/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
|
||||
/// beyond the branch's branch point, we only count up to the branch point.
|
||||
pub pitr_history_size: u64,
|
||||
|
||||
/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
|
||||
/// ancestor data used by this branch would have been retained anyway). If this is false, then
|
||||
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
|
||||
/// otherwise be able to GC.
|
||||
pub within_ancestor_pitr: bool,
|
||||
|
||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||
|
||||
pub wal_source_connstr: Option<String>,
|
||||
@@ -1471,7 +1494,7 @@ mod tests {
|
||||
state: TenantState::Active,
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
generation: None,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1481,8 +1504,7 @@ mod tests {
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
},
|
||||
"generation" : 1
|
||||
}
|
||||
});
|
||||
|
||||
let original_broken = TenantInfo {
|
||||
@@ -1493,7 +1515,7 @@ mod tests {
|
||||
},
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
generation: None,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
@@ -1507,8 +1529,7 @@ mod tests {
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
},
|
||||
"generation" : 1
|
||||
}
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
@@ -1526,6 +1547,18 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_reject_unknown_field() {
|
||||
let id = TenantId::generate();
|
||||
let create_request = json!({
|
||||
"new_tenant_id": id.to_string(),
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
});
|
||||
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let id = TenantId::generate();
|
||||
let config_request = json!({
|
||||
"tenant_id": id.to_string(),
|
||||
@@ -1537,6 +1570,18 @@ mod tests {
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let attach_request = json!({
|
||||
"config": {
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
},
|
||||
});
|
||||
let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -144,7 +144,20 @@ impl PgConnectionConfig {
|
||||
// implement and this function is hardly a bottleneck. The function is only called around
|
||||
// establishing a new connection.
|
||||
#[allow(unstable_name_collisions)]
|
||||
config.options(&encode_options(&self.options));
|
||||
config.options(
|
||||
&self
|
||||
.options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>(),
|
||||
);
|
||||
}
|
||||
config
|
||||
}
|
||||
@@ -165,21 +178,6 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unstable_name_collisions)]
|
||||
fn encode_options(options: &[String]) -> String {
|
||||
options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_pg_connection_config {
|
||||
use crate::{encode_options, PgConnectionConfig};
|
||||
use crate::PgConnectionConfig;
|
||||
use once_cell::sync::Lazy;
|
||||
use url::Host;
|
||||
|
||||
@@ -257,12 +255,18 @@ mod tests_pg_connection_config {
|
||||
|
||||
#[test]
|
||||
fn test_with_options() {
|
||||
let options = encode_options(&[
|
||||
"hello".to_owned(),
|
||||
"world".to_owned(),
|
||||
"with space".to_owned(),
|
||||
"and \\ backslashes".to_owned(),
|
||||
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
|
||||
"hello",
|
||||
"world",
|
||||
"with space",
|
||||
"and \\ backslashes",
|
||||
]);
|
||||
assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
|
||||
assert_eq!(cfg.host(), &*STUB_HOST);
|
||||
assert_eq!(cfg.port(), 123);
|
||||
assert_eq!(cfg.raw_address(), "stub.host.example:123");
|
||||
assert_eq!(
|
||||
cfg.to_tokio_postgres_config().get_options(),
|
||||
Some("hello world with\\ space and\\ \\\\\\ backslashes")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,6 +48,15 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
#[allow(clippy::identity_op)]
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
/// Interval of checkpointing metadata file. We should store metadata file to enforce
|
||||
/// predicate that checkpoint.nextXid is larger than any XID in WAL.
|
||||
/// But flushing checkpoint file for each transaction seems to be too expensive,
|
||||
/// so XID_CHECKPOINT_INTERVAL is used to forward align nextXid and so perform
|
||||
/// metadata checkpoint only once per XID_CHECKPOINT_INTERVAL transactions.
|
||||
/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||
/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG.
|
||||
const XID_CHECKPOINT_INTERVAL: u32 = 1024;
|
||||
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
}
|
||||
@@ -322,10 +331,14 @@ impl CheckPoint {
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let new_xid = std::cmp::max(
|
||||
let mut new_xid = std::cmp::max(
|
||||
xid.wrapping_add(1),
|
||||
pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||
);
|
||||
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
||||
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||
new_xid =
|
||||
new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
|
||||
let full_xid = self.nextXid.value;
|
||||
let old_xid = full_xid as u32;
|
||||
if new_xid.wrapping_sub(old_xid) as i32 > 0 {
|
||||
@@ -343,28 +356,6 @@ impl CheckPoint {
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Advance next multi-XID/offset to those given in arguments.
|
||||
///
|
||||
/// It's important that this handles wraparound correctly. This should match the
|
||||
/// MultiXactAdvceNextMXact() logic in PostgreSQL's xlog_redo() function.
|
||||
///
|
||||
/// Returns 'true' if the Checkpoint was updated.
|
||||
pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
|
||||
let mut modified = false;
|
||||
|
||||
if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 {
|
||||
self.nextMulti = multi_xid;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 {
|
||||
self.nextMultiOffset = multi_offset;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
modified
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate new, empty WAL segment, with correct block headers at the first
|
||||
|
||||
@@ -202,53 +202,6 @@ pub fn test_update_next_xid() {
|
||||
assert_eq!(checkpoint.nextXid.value, 2048);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_update_next_multixid() {
|
||||
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
|
||||
|
||||
// simple case
|
||||
checkpoint.nextMulti = 20;
|
||||
checkpoint.nextMultiOffset = 20;
|
||||
checkpoint.update_next_multixid(1000, 2000);
|
||||
assert_eq!(checkpoint.nextMulti, 1000);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 2000);
|
||||
|
||||
// No change
|
||||
checkpoint.update_next_multixid(500, 900);
|
||||
assert_eq!(checkpoint.nextMulti, 1000);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 2000);
|
||||
|
||||
// Close to wraparound, but not wrapped around yet
|
||||
checkpoint.nextMulti = 0xffff0000;
|
||||
checkpoint.nextMultiOffset = 0xfffe0000;
|
||||
checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff);
|
||||
assert_eq!(checkpoint.nextMulti, 0xffff00ff);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);
|
||||
|
||||
// Wraparound
|
||||
checkpoint.update_next_multixid(1, 900);
|
||||
assert_eq!(checkpoint.nextMulti, 1);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 900);
|
||||
|
||||
// Wraparound nextMulti to 0.
|
||||
//
|
||||
// It's a bit surprising that nextMulti can be 0, because that's a special value
|
||||
// (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound:
|
||||
// nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips
|
||||
// the 0 and the next multi-xid actually assigned is 1.
|
||||
checkpoint.nextMulti = 0xffff0000;
|
||||
checkpoint.nextMultiOffset = 0xfffe0000;
|
||||
checkpoint.update_next_multixid(0, 0xfffe00ff);
|
||||
assert_eq!(checkpoint.nextMulti, 0);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);
|
||||
|
||||
// Wraparound nextMultiOffset to 0
|
||||
checkpoint.update_next_multixid(0, 0);
|
||||
assert_eq!(checkpoint.nextMulti, 0);
|
||||
assert_eq!(checkpoint.nextMultiOffset, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_encode_logical_message() {
|
||||
let expected = [
|
||||
|
||||
@@ -14,9 +14,8 @@ aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
camino.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -34,7 +34,7 @@ use utils::backoff;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing,
|
||||
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
|
||||
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
|
||||
@@ -1,264 +0,0 @@
|
||||
use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};
|
||||
|
||||
use aws_sdk_s3::types::StorageClass;
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// The storage connection configuration.
|
||||
#[serde(flatten)]
|
||||
pub storage: RemoteStorageKind,
|
||||
/// A common timeout enforced for all requests after concurrency limiter permit has been
|
||||
/// acquired.
|
||||
#[serde(
|
||||
with = "humantime_serde",
|
||||
default = "default_timeout",
|
||||
skip_serializing_if = "is_default_timeout"
|
||||
)]
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
fn default_timeout() -> Duration {
|
||||
RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
fn is_default_timeout(d: &Duration) -> bool {
|
||||
*d == RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs { local_path: Utf8PathBuf },
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
/// Azure Blob based storage, storing all files in the container
|
||||
/// specified by the config
|
||||
AzureContainer(AzureConfig),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq, Deserialize, Serialize)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
#[serde(default = "default_remote_storage_s3_concurrency_limit")]
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
#[serde(default = "default_max_keys_per_list_response")]
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
#[serde(
|
||||
deserialize_with = "deserialize_storage_class",
|
||||
serialize_with = "serialize_storage_class",
|
||||
default
|
||||
)]
|
||||
pub upload_storage_class: Option<StorageClass>,
|
||||
}
|
||||
|
||||
fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn default_max_keys_per_list_response() -> Option<i32> {
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct AzureConfig {
|
||||
/// Name of the container to connect to.
|
||||
pub container_name: String,
|
||||
/// Name of the storage account the container is inside of
|
||||
pub storage_account: Option<String>,
|
||||
/// The region where the bucket is located at.
|
||||
pub container_region: String,
|
||||
/// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
|
||||
pub prefix_in_container: Option<String>,
|
||||
/// Azure has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
|
||||
#[serde(default = "default_remote_storage_azure_concurrency_limit")]
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
#[serde(default = "default_max_keys_per_list_response")]
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
|
||||
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
|
||||
}
|
||||
|
||||
impl Debug for AzureConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AzureConfig")
|
||||
.field("bucket_name", &self.container_name)
|
||||
.field("storage_account", &self.storage_account)
|
||||
.field("bucket_region", &self.container_region)
|
||||
.field("prefix_in_container", &self.prefix_in_container)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
|
||||
deserializer: D,
|
||||
) -> Result<Option<StorageClass>, D::Error> {
|
||||
Option::<String>::deserialize(deserializer).and_then(|s| {
|
||||
if let Some(s) = s {
|
||||
use serde::de::Error;
|
||||
let storage_class = StorageClass::from_str(&s).expect("infallible");
|
||||
#[allow(deprecated)]
|
||||
if matches!(storage_class, StorageClass::Unknown(_)) {
|
||||
return Err(D::Error::custom(format!(
|
||||
"Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
|
||||
StorageClass::values()
|
||||
)));
|
||||
}
|
||||
Ok(Some(storage_class))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_storage_class<S: serde::Serializer>(
|
||||
val: &Option<StorageClass>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
let val = val.as_ref().map(StorageClass::as_str);
|
||||
Option::<&str>::serialize(&val, serializer)
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
|
||||
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||
Ok(utils::toml_edit_ext::deserialize_item(toml)?)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
RemoteStorageConfig::from_toml(toml.as_item())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_localfs_config_with_timeout() {
|
||||
let input = "local_path = '.'
|
||||
timeout = '5s'";
|
||||
|
||||
let config = parse(input).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs {
|
||||
local_path: Utf8PathBuf::from(".")
|
||||
},
|
||||
timeout: Duration::from_secs(5)
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_s3_parsing() {
|
||||
let toml = "\
|
||||
bucket_name = 'foo-bar'
|
||||
bucket_region = 'eu-central-1'
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: "foo-bar".into(),
|
||||
bucket_region: "eu-central-1".into(),
|
||||
prefix_in_bucket: None,
|
||||
endpoint: None,
|
||||
concurrency_limit: default_remote_storage_s3_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
upload_storage_class: Some(StorageClass::IntelligentTiering),
|
||||
}),
|
||||
timeout: Duration::from_secs(7)
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_azure_parsing() {
|
||||
let toml = "\
|
||||
container_name = 'foo-bar'
|
||||
container_region = 'westeurope'
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: "foo-bar".into(),
|
||||
storage_account: None,
|
||||
container_region: "westeurope".into(),
|
||||
prefix_in_container: None,
|
||||
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
}),
|
||||
timeout: Duration::from_secs(7)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -10,7 +10,6 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod azure_blob;
|
||||
mod config;
|
||||
mod error;
|
||||
mod local_fs;
|
||||
mod metrics;
|
||||
@@ -19,10 +18,17 @@ mod simulate_failures;
|
||||
mod support;
|
||||
|
||||
use std::{
|
||||
collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime,
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
pin::Pin,
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, Context};
|
||||
use aws_sdk_s3::types::StorageClass;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use bytes::Bytes;
|
||||
@@ -30,6 +36,7 @@ use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
pub use self::{
|
||||
@@ -38,8 +45,6 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config};
|
||||
|
||||
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
||||
pub use azure_core::Etag;
|
||||
|
||||
@@ -446,7 +451,7 @@ impl GenericRemoteStorage {
|
||||
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
let timeout = storage_config.timeout;
|
||||
Ok(match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs { local_path: path } => {
|
||||
RemoteStorageKind::LocalFs(path) => {
|
||||
info!("Using fs root '{path}' as a remote storage");
|
||||
Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
|
||||
}
|
||||
@@ -521,6 +526,262 @@ impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
/// A common timeout enforced for all requests after concurrency limiter permit has been
|
||||
/// acquired.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(Utf8PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
/// Azure Blob based storage, storing all files in the container
|
||||
/// specified by the config
|
||||
AzureContainer(AzureConfig),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
pub upload_storage_class: Option<StorageClass>,
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct AzureConfig {
|
||||
/// Name of the container to connect to.
|
||||
pub container_name: String,
|
||||
/// Name of the storage account the container is inside of
|
||||
pub storage_account: Option<String>,
|
||||
/// The region where the bucket is located at.
|
||||
pub container_region: String,
|
||||
/// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
|
||||
pub prefix_in_container: Option<String>,
|
||||
/// Azure has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
impl Debug for AzureConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AzureConfig")
|
||||
.field("bucket_name", &self.container_name)
|
||||
.field("storage_account", &self.storage_account)
|
||||
.field("bucket_region", &self.container_region)
|
||||
.field("prefix_in_container", &self.prefix_in_container)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
|
||||
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
let container_name = toml.get("container_name");
|
||||
let container_region = toml.get("container_region");
|
||||
|
||||
let use_azure = container_name.is_some() && container_region.is_some();
|
||||
|
||||
let default_concurrency_limit = if use_azure {
|
||||
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
|
||||
} else {
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
|
||||
};
|
||||
let concurrency_limit = NonZeroUsize::new(
|
||||
parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
|
||||
let max_keys_per_list_response =
|
||||
parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
|
||||
.context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
|
||||
.or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
|
||||
|
||||
let endpoint = toml
|
||||
.get("endpoint")
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?;
|
||||
|
||||
let timeout = toml
|
||||
.get("timeout")
|
||||
.map(|timeout| {
|
||||
timeout
|
||||
.as_str()
|
||||
.ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
|
||||
})
|
||||
.transpose()
|
||||
.and_then(|timeout| {
|
||||
timeout
|
||||
.map(humantime::parse_duration)
|
||||
.transpose()
|
||||
.map_err(anyhow::Error::new)
|
||||
})
|
||||
.context("parse timeout")?
|
||||
.unwrap_or(Self::DEFAULT_TIMEOUT);
|
||||
|
||||
if timeout < Duration::from_secs(1) {
|
||||
bail!("timeout was specified as {timeout:?} which is too low");
|
||||
}
|
||||
|
||||
let storage = match (
|
||||
local_path,
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
container_name,
|
||||
container_region,
|
||||
) {
|
||||
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
|
||||
(None, None, None, None, None) => return Ok(None),
|
||||
(_, Some(_), None, ..) => {
|
||||
bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
|
||||
}
|
||||
(_, None, Some(_), ..) => {
|
||||
bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
|
||||
}
|
||||
(None, Some(bucket_name), Some(bucket_region), ..) => {
|
||||
RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||
prefix_in_bucket: toml
|
||||
.get("prefix_in_bucket")
|
||||
.map(|prefix_in_bucket| {
|
||||
parse_toml_string("prefix_in_bucket", prefix_in_bucket)
|
||||
})
|
||||
.transpose()?,
|
||||
endpoint,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
upload_storage_class: toml
|
||||
.get("upload_storage_class")
|
||||
.map(|prefix_in_bucket| -> anyhow::Result<_> {
|
||||
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
|
||||
let storage_class = StorageClass::from_str(&s).expect("infallible");
|
||||
#[allow(deprecated)]
|
||||
if matches!(storage_class, StorageClass::Unknown(_)) {
|
||||
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
|
||||
}
|
||||
Ok(storage_class)
|
||||
})
|
||||
.transpose()?,
|
||||
})
|
||||
}
|
||||
(_, _, _, Some(_), None) => {
|
||||
bail!("'container_name' option is mandatory if 'container_region' is given ")
|
||||
}
|
||||
(_, _, _, None, Some(_)) => {
|
||||
bail!("'container_name' option is mandatory if 'container_region' is given ")
|
||||
}
|
||||
(None, None, None, Some(container_name), Some(container_region)) => {
|
||||
RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: parse_toml_string("container_name", container_name)?,
|
||||
storage_account: toml
|
||||
.get("storage_account")
|
||||
.map(|storage_account| {
|
||||
parse_toml_string("storage_account", storage_account)
|
||||
})
|
||||
.transpose()?,
|
||||
container_region: parse_toml_string("container_region", container_region)?,
|
||||
prefix_in_container: toml
|
||||
.get("prefix_in_container")
|
||||
.map(|prefix_in_container| {
|
||||
parse_toml_string("prefix_in_container", prefix_in_container)
|
||||
})
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
})
|
||||
}
|
||||
(Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
|
||||
Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
|
||||
),
|
||||
(Some(_), Some(_), ..) => {
|
||||
bail!("'local_path' and 'bucket_name' are mutually exclusive")
|
||||
}
|
||||
(Some(_), _, _, Some(_), Some(_)) => {
|
||||
bail!("local_path and 'container_name' are mutually exclusive")
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(RemoteStorageConfig { storage, timeout }))
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions to parse a toml Item
|
||||
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
|
||||
where
|
||||
I: TryFrom<i64, Error = E>,
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
let toml_integer = match item.get(name) {
|
||||
Some(item) => item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
I::try_from(toml_integer)
|
||||
.map(Some)
|
||||
.with_context(|| format!("configure option {name} is too large"))
|
||||
}
|
||||
|
||||
fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
struct ConcurrencyLimiter {
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
@@ -588,4 +849,24 @@ mod tests {
|
||||
let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
|
||||
assert_eq!(err.to_string(), "Path \"/\" is not relative");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_localfs_config_with_timeout() {
|
||||
let input = "local_path = '.'
|
||||
timeout = '5s'";
|
||||
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
|
||||
let config = RemoteStorageConfig::from_toml(toml.as_item())
|
||||
.unwrap()
|
||||
.expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
|
||||
timeout: Duration::from_secs(5)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,12 +46,12 @@ use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
config::S3Config,
|
||||
error::Cancelled,
|
||||
metrics::{start_counting_cancelled_wait, start_measuring_requests},
|
||||
support::PermitCarrying,
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use crate::metrics::AttemptOutcome;
|
||||
|
||||
@@ -34,10 +34,10 @@ struct SegmentSize {
|
||||
}
|
||||
|
||||
struct SizeAlternatives {
|
||||
/// cheapest alternative if parent is available.
|
||||
// cheapest alternative if parent is available.
|
||||
incremental: SegmentSize,
|
||||
|
||||
/// cheapest alternative if parent node is not available
|
||||
// cheapest alternative if parent node is not available
|
||||
non_incremental: Option<SegmentSize>,
|
||||
}
|
||||
|
||||
|
||||
@@ -3,17 +3,10 @@ use std::fmt::Write;
|
||||
|
||||
const SVG_WIDTH: f32 = 500.0;
|
||||
|
||||
/// Different branch kind for SVG drawing.
|
||||
#[derive(PartialEq)]
|
||||
pub enum SvgBranchKind {
|
||||
Timeline,
|
||||
Lease,
|
||||
}
|
||||
|
||||
struct SvgDraw<'a> {
|
||||
storage: &'a StorageModel,
|
||||
branches: &'a [String],
|
||||
seg_to_branch: &'a [(usize, SvgBranchKind)],
|
||||
seg_to_branch: &'a [usize],
|
||||
sizes: &'a [SegmentSizeResult],
|
||||
|
||||
// layout
|
||||
@@ -49,18 +42,13 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> {
|
||||
"<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"10\" y1=\"85\" x2=\"10\" y2=\"95\" stroke-width=\"3\" stroke=\"blue\" />"
|
||||
)?;
|
||||
writeln!(result, "<text x=\"20\" y=\"95\">LSN lease</text>")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn draw_svg(
|
||||
storage: &StorageModel,
|
||||
branches: &[String],
|
||||
seg_to_branch: &[(usize, SvgBranchKind)],
|
||||
seg_to_branch: &[usize],
|
||||
sizes: &SizeResult,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut draw = SvgDraw {
|
||||
@@ -112,7 +100,7 @@ impl<'a> SvgDraw<'a> {
|
||||
|
||||
// Layout the timelines on Y dimension.
|
||||
// TODO
|
||||
let mut y = 120.0;
|
||||
let mut y = 100.0;
|
||||
let mut branch_y_coordinates = Vec::new();
|
||||
for _branch in self.branches {
|
||||
branch_y_coordinates.push(y);
|
||||
@@ -121,7 +109,7 @@ impl<'a> SvgDraw<'a> {
|
||||
|
||||
// Calculate coordinates for each point
|
||||
let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
|
||||
.map(|(seg, (branch_id, _))| {
|
||||
.map(|(seg, branch_id)| {
|
||||
let x = (seg.lsn - min_lsn) as f32 / xscale;
|
||||
let y = branch_y_coordinates[*branch_id];
|
||||
(x, y)
|
||||
@@ -187,22 +175,6 @@ impl<'a> SvgDraw<'a> {
|
||||
|
||||
// draw a snapshot point if it's needed
|
||||
let (coord_x, coord_y) = self.seg_coordinates[seg_id];
|
||||
|
||||
let (_, kind) = &self.seg_to_branch[seg_id];
|
||||
if kind == &SvgBranchKind::Lease {
|
||||
let (x1, y1) = (coord_x, coord_y - 10.0);
|
||||
let (x2, y2) = (coord_x, coord_y + 10.0);
|
||||
|
||||
let style = "stroke-width=\"3\" stroke=\"blue\"";
|
||||
|
||||
writeln!(
|
||||
result,
|
||||
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
|
||||
)?;
|
||||
writeln!(result, " <title>leased lsn at {}</title>", seg.lsn)?;
|
||||
writeln!(result, "</line>")?;
|
||||
}
|
||||
|
||||
if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
|
||||
writeln!(
|
||||
result,
|
||||
|
||||
@@ -40,7 +40,6 @@ thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
|
||||
@@ -37,6 +37,20 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Op<T, E> {
|
||||
async fn call(&mut self) -> Result<T, E>;
|
||||
}
|
||||
|
||||
impl<T, E, F, Fut> Op<T, E> for F
|
||||
where
|
||||
F: FnMut() -> Fut,
|
||||
Fut: Future<Output = Result<T, E>>,
|
||||
{
|
||||
async fn call(&mut self) -> Result<T, E> {
|
||||
(&mut *self)().await
|
||||
}
|
||||
}
|
||||
|
||||
/// Retries passed operation until one of the following conditions are met:
|
||||
/// - encountered error is considered as permanent (non-retryable)
|
||||
/// - retries have been exhausted
|
||||
@@ -51,8 +65,8 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
|
||||
/// for any other error type. Final failed attempt is logged with `{:?}`.
|
||||
///
|
||||
/// Returns `None` if cancellation was noticed during backoff or the terminal result.
|
||||
pub async fn retry<T, O, F, E>(
|
||||
mut op: O,
|
||||
pub async fn retry<T, E>(
|
||||
mut op: impl Op<T, E>,
|
||||
is_permanent: impl Fn(&E) -> bool,
|
||||
warn_threshold: u32,
|
||||
max_retries: u32,
|
||||
@@ -63,8 +77,6 @@ where
|
||||
// Not std::error::Error because anyhow::Error doesnt implement it.
|
||||
// For context see https://github.com/dtolnay/anyhow/issues/63
|
||||
E: Display + Debug + 'static,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
@@ -72,7 +84,7 @@ where
|
||||
return None;
|
||||
}
|
||||
|
||||
let result = op().await;
|
||||
let result = op.call().await;
|
||||
match &result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
|
||||
@@ -9,11 +9,20 @@ use serde::{Deserialize, Serialize};
|
||||
/// numbers are used.
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
pub enum Generation {
|
||||
// The None Generation is used in the metadata of layers written before generations were
|
||||
// introduced. A running Tenant always has a valid generation, but the layer metadata may
|
||||
// include None generations.
|
||||
// Generations with this magic value will not add a suffix to S3 keys, and will not
|
||||
// be included in persisted index_part.json. This value is only to be used
|
||||
// during migration from pre-generation metadata to generation-aware metadata,
|
||||
// and should eventually go away.
|
||||
//
|
||||
// A special Generation is used rather than always wrapping Generation in an Option,
|
||||
// so that code handling generations doesn't have to be aware of the legacy
|
||||
// case everywhere it touches a generation.
|
||||
None,
|
||||
|
||||
// Generations with this magic value may never be used to construct S3 keys:
|
||||
// we will panic if someone tries to. This is for Tenants in the "Broken" state,
|
||||
// so that we can satisfy their constructor with a Generation without risking
|
||||
// a code bug using it in an S3 write (broken tenants should never write)
|
||||
Broken,
|
||||
Valid(u32),
|
||||
}
|
||||
|
||||
@@ -33,6 +42,11 @@ impl Generation {
|
||||
Self::None
|
||||
}
|
||||
|
||||
// Create a new generation that will panic if you try to use get_suffix
|
||||
pub fn broken() -> Self {
|
||||
Self::Broken
|
||||
}
|
||||
|
||||
pub const fn new(v: u32) -> Self {
|
||||
Self::Valid(v)
|
||||
}
|
||||
@@ -46,6 +60,9 @@ impl Generation {
|
||||
match self {
|
||||
Self::Valid(v) => GenerationFileSuffix(Some(*v)),
|
||||
Self::None => GenerationFileSuffix(None),
|
||||
Self::Broken => {
|
||||
panic!("Tried to use a broken generation");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,6 +86,7 @@ impl Generation {
|
||||
}
|
||||
}
|
||||
Self::None => Self::None,
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,6 +95,7 @@ impl Generation {
|
||||
match self {
|
||||
Self::Valid(n) => Self::Valid(*n + 1),
|
||||
Self::None => Self::Valid(1),
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +128,7 @@ impl Serialize for Generation {
|
||||
if let Self::Valid(v) = self {
|
||||
v.serialize(serializer)
|
||||
} else {
|
||||
// We should never be asked to serialize a None. Structures
|
||||
// We should never be asked to serialize a None or Broken. Structures
|
||||
// that include an optional generation should convert None to an
|
||||
// Option<Generation>::None
|
||||
Err(serde::ser::Error::custom(
|
||||
@@ -140,6 +159,9 @@ impl Debug for Generation {
|
||||
Self::None => {
|
||||
write!(f, "<none>")
|
||||
}
|
||||
Self::Broken => {
|
||||
write!(f, "<broken>")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,15 +8,22 @@ use super::error::ApiError;
|
||||
pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<T, ApiError> {
|
||||
json_request_or_empty_body(request)
|
||||
.await?
|
||||
.context("missing request body")
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
|
||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<Option<T>, ApiError> {
|
||||
let body = hyper::body::aggregate(request.body_mut())
|
||||
.await
|
||||
.context("Failed to read request body")
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
if body.remaining() == 0 {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"missing request body"
|
||||
)));
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
|
||||
@@ -24,6 +31,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
serde_path_to_error::deserialize(&mut deser)
|
||||
// intentionally stringify because the debug version is not helpful in python logs
|
||||
.map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
|
||||
.map(Some)
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
|
||||
@@ -94,8 +94,6 @@ pub mod env;
|
||||
|
||||
pub mod poison;
|
||||
|
||||
pub mod toml_edit_ext;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum Error {
|
||||
#[error("item is not a document")]
|
||||
ItemIsNotADocument,
|
||||
#[error(transparent)]
|
||||
Serde(toml_edit::de::Error),
|
||||
}
|
||||
|
||||
pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
|
||||
where
|
||||
T: serde::de::DeserializeOwned,
|
||||
{
|
||||
let document: toml_edit::Document = match item {
|
||||
toml_edit::Item::Table(toml) => toml.clone().into(),
|
||||
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
|
||||
toml.clone().into_table().into()
|
||||
}
|
||||
_ => return Err(Error::ItemIsNotADocument),
|
||||
};
|
||||
|
||||
toml_edit::de::from_document(document).map_err(Error::Serde)
|
||||
}
|
||||
@@ -1,5 +1,3 @@
|
||||
#![allow(clippy::todo)]
|
||||
|
||||
use std::ffi::CString;
|
||||
|
||||
use crate::{
|
||||
|
||||
@@ -48,7 +48,6 @@
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! ```
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
@@ -189,7 +188,6 @@ impl Request {
|
||||
manager
|
||||
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||
.await
|
||||
.context("request_redo")
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
|
||||
@@ -205,6 +205,15 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
|
||||
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// The tenant deletion API can return 202 if deletion is incomplete, or
|
||||
/// 404 if it is complete. Callers are responsible for checking the status
|
||||
/// code and retrying. Error codes other than 404 will return Err().
|
||||
|
||||
@@ -83,18 +83,10 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||
|
||||
// The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001
|
||||
|
||||
// Handle generation number `-00000001` part
|
||||
if lsns.last().expect("should").len() == 8 {
|
||||
lsns.pop();
|
||||
}
|
||||
|
||||
// Handle version number `-v1` part
|
||||
if lsns.last().expect("should").starts_with('v') {
|
||||
lsns.pop();
|
||||
}
|
||||
|
||||
if lsns.len() == 1 {
|
||||
lsns.push(lsns[0]);
|
||||
}
|
||||
|
||||
@@ -99,7 +99,11 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
}
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
|
||||
async fn get_holes(
|
||||
path: &Utf8Path,
|
||||
max_holes: usize,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Vec<Hole>> {
|
||||
let file = VirtualFile::open(path, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
|
||||
@@ -57,7 +57,7 @@ pub(crate) enum LayerCmd {
|
||||
},
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &mut RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
page_cache::init(100);
|
||||
|
||||
@@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let toml_item = toml_document
|
||||
.get("remote_storage")
|
||||
.expect("need remote_storage");
|
||||
let config = RemoteStorageConfig::from_toml(toml_item)?;
|
||||
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
|
||||
let cancel = CancellationToken::new();
|
||||
storage
|
||||
|
||||
@@ -60,7 +60,7 @@ pub async fn send_basebackup_tarball<'a, W>(
|
||||
req_lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
ctx: &'a RequestContext,
|
||||
ctx: &'a mut RequestContext,
|
||||
) -> Result<(), BasebackupError>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -141,7 +141,7 @@ where
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
full_backup: bool,
|
||||
ctx: &'a RequestContext,
|
||||
ctx: &'a mut RequestContext,
|
||||
}
|
||||
|
||||
/// A sink that accepts SLRU blocks ordered by key and forwards
|
||||
@@ -348,36 +348,35 @@ where
|
||||
self.add_rel(rel, rel).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (path, content) in self
|
||||
.timeline
|
||||
.list_aux_files(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
{
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
} else if path == "pg_logical/replorigin_checkpoint" {
|
||||
// replorigin_checkoint is written only on compute shutdown, so it contains
|
||||
// deteriorated values. So we generate our own version of this file for the particular LSN
|
||||
// based on information about replorigins extracted from transaction commit records.
|
||||
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
|
||||
// but now we should handle (skip) it for backward compatibility.
|
||||
continue;
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
for (path, content) in self
|
||||
.timeline
|
||||
.list_aux_files(self.lsn, self.ctx)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
{
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
} else if path == "pg_logical/replorigin_checkpoint" {
|
||||
// replorigin_checkoint is written only on compute shutdown, so it contains
|
||||
// deteriorated values. So we generate our own version of this file for the particular LSN
|
||||
// based on information about replorigins extracted from transaction commit records.
|
||||
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
|
||||
// but now we should handle (skip) it for backward compatibility.
|
||||
continue;
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
}
|
||||
}
|
||||
|
||||
if min_restart_lsn != Lsn::MAX {
|
||||
info!(
|
||||
"Min restart LSN for logical replication is {}",
|
||||
|
||||
@@ -421,10 +421,6 @@ fn start_pageserver(
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
};
|
||||
|
||||
info!(config=?conf.l0_flush, "using l0_flush config");
|
||||
let l0_flush_global_state =
|
||||
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
@@ -433,7 +429,6 @@ fn start_pageserver(
|
||||
broker_client: broker_client.clone(),
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
@@ -30,13 +30,18 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
|
||||
use crate::{tenant::config::TenantConf, virtual_file};
|
||||
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
|
||||
@@ -50,7 +55,6 @@ pub mod defaults {
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
@@ -91,9 +95,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::DisabledNoDecompress;
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
@@ -163,7 +164,7 @@ pub mod defaults {
|
||||
|
||||
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
|
||||
|
||||
#[remote_storage]
|
||||
[remote_storage]
|
||||
|
||||
"#
|
||||
);
|
||||
@@ -289,16 +290,12 @@ pub struct PageServerConf {
|
||||
|
||||
pub validate_vectored_get: bool,
|
||||
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
|
||||
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
||||
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
|
||||
/// of ephemeral data.
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub l0_flush: L0FlushConfig,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -403,11 +400,7 @@ struct PageServerConfigBuilder {
|
||||
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
image_compression: BuilderValue<ImageCompressionAlgorithm>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
l0_flush: BuilderValue<L0FlushConfig>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -494,10 +487,8 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -681,18 +672,10 @@ impl PageServerConfigBuilder {
|
||||
self.validate_vectored_get = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
|
||||
self.image_compression = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn l0_flush(&mut self, value: L0FlushConfig) {
|
||||
self.l0_flush = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -749,9 +732,7 @@ impl PageServerConfigBuilder {
|
||||
get_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
l0_flush,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -830,12 +811,21 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_shard_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's LocationConf be stored.
|
||||
pub(crate) fn tenant_location_config_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
///
|
||||
/// Legacy: superseded by tenant_location_config_path. Eventually
|
||||
/// remove this function.
|
||||
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_LOCATION_CONFIG_NAME)
|
||||
}
|
||||
@@ -870,6 +860,14 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_deleted_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
@@ -942,7 +940,7 @@ impl PageServerConf {
|
||||
"http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
|
||||
"pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
|
||||
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
"tenant_config" => {
|
||||
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
|
||||
@@ -970,7 +968,7 @@ impl PageServerConf {
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
},
|
||||
"metric_collection_bucket" => {
|
||||
builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
|
||||
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
"synthetic_size_calculation_interval" =>
|
||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||
@@ -1028,15 +1026,9 @@ impl PageServerConf {
|
||||
"validate_vectored_get" => {
|
||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||
}
|
||||
"image_compression" => {
|
||||
builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
|
||||
}
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
"l0_flush" => {
|
||||
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1118,10 +1110,8 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1360,9 +1350,7 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1435,9 +1423,7 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1482,7 +1468,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
@@ -1717,19 +1703,6 @@ threshold = "20m"
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_remote_storage_is_error() {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let (workdir, _) = prepare_fs(&tempdir).unwrap();
|
||||
let input = r#"
|
||||
remote_storage = {}
|
||||
"#;
|
||||
let doc = toml_edit::Document::from_str(input).unwrap();
|
||||
let err = PageServerConf::parse_and_validate(&doc, &workdir)
|
||||
.expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
|
||||
assert!(format!("{err}").contains("remote_storage"), "{err}");
|
||||
}
|
||||
|
||||
fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
|
||||
let tempdir_path = tempdir.path();
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ pub async fn collect_metrics(
|
||||
node_id: NodeId,
|
||||
local_disk_storage: Utf8PathBuf,
|
||||
cancel: CancellationToken,
|
||||
ctx: RequestContext,
|
||||
mut ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if _cached_metric_collection_interval != Duration::ZERO {
|
||||
tracing::warn!(
|
||||
@@ -60,7 +60,7 @@ pub async fn collect_metrics(
|
||||
}
|
||||
|
||||
// spin up background worker that caclulates tenant sizes
|
||||
let worker_ctx =
|
||||
let mut worker_ctx =
|
||||
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
@@ -76,7 +76,7 @@ pub async fn collect_metrics(
|
||||
tenant_manager,
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
&mut worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
@@ -122,7 +122,8 @@ pub async fn collect_metrics(
|
||||
let started_at = Instant::now();
|
||||
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||
let metrics =
|
||||
metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &mut ctx).await;
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
@@ -280,7 +281,7 @@ async fn calculate_synthetic_size_worker(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting calculate_synthetic_size_worker");
|
||||
scopeguard::defer! {
|
||||
@@ -340,7 +341,7 @@ async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
}
|
||||
|
||||
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
|
||||
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &mut RequestContext) {
|
||||
const CAUSE: LogicalSizeCalculationCause =
|
||||
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
|
||||
|
||||
|
||||
@@ -184,7 +184,7 @@ impl MetricsKey {
|
||||
pub(super) async fn collect_all_metrics(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cached_metrics: &Cache,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Vec<RawMetric> {
|
||||
use pageserver_api::models::TenantState;
|
||||
|
||||
@@ -220,7 +220,7 @@ pub(super) async fn collect_all_metrics(
|
||||
res
|
||||
}
|
||||
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &mut RequestContext) -> Vec<RawMetric>
|
||||
where
|
||||
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
|
||||
{
|
||||
@@ -342,7 +342,7 @@ impl TimelineSnapshot {
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
fn collect(
|
||||
t: &Arc<crate::tenant::Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Option<Self>> {
|
||||
if !t.is_active() {
|
||||
// no collection for broken or stopping needed, we will still keep the cached values
|
||||
|
||||
@@ -97,9 +97,10 @@ pub struct RequestContext {
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
page_content_kind: PageContentKind,
|
||||
pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
|
||||
}
|
||||
|
||||
pub(crate) struct MicrosSpentThrottled(optional_counter::MicroSecondsCounterU32);
|
||||
|
||||
/// The kind of access to the page cache.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||
pub enum PageContentKind {
|
||||
@@ -158,7 +159,7 @@ impl RequestContextBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extend(original: &RequestContext) -> Self {
|
||||
pub fn extend(original: &mut RequestContext) -> Self {
|
||||
Self {
|
||||
// This is like a Copy, but avoid implementing Copy because ordinary users of
|
||||
// RequestContext should always move or ref it.
|
||||
|
||||
@@ -382,6 +382,17 @@ pub enum DeletionQueueError {
|
||||
}
|
||||
|
||||
impl DeletionQueueClient {
|
||||
pub(crate) fn broken() -> Self {
|
||||
// Channels whose receivers are immediately dropped.
|
||||
let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
|
||||
Self {
|
||||
tx,
|
||||
executor_tx,
|
||||
lsn_table: Arc::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// This is cancel-safe. If you drop the future before it completes, the message
|
||||
/// is not pushed, although in the context of the deletion queue it doesn't matter: once
|
||||
/// we decide to do a deletion the decision is always final.
|
||||
@@ -839,9 +850,7 @@ mod test {
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs {
|
||||
local_path: remote_fs_dir.clone(),
|
||||
},
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
@@ -190,7 +190,7 @@ where
|
||||
}
|
||||
} else {
|
||||
// If we failed validation, then do not apply any of the projected updates
|
||||
info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
|
||||
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
|
||||
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
|
||||
}
|
||||
}
|
||||
@@ -225,7 +225,7 @@ where
|
||||
&& (tenant.generation == *validated_generation);
|
||||
|
||||
if !this_list_valid {
|
||||
info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
|
||||
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
|
||||
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
|
||||
mutated = true;
|
||||
} else {
|
||||
|
||||
@@ -78,14 +78,29 @@ paths:
|
||||
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting
|
||||
a non-existent tenant is considered successful (returns 200).
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant was successfully deleted, or was already not found.
|
||||
"503":
|
||||
description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted)
|
||||
|
||||
"404":
|
||||
description: Tenant not found. This is a success result, equivalent to 200.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Deletion is already in progress, continue polling
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"412":
|
||||
description: Deletion may not proceed, tenant is not in Active state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
|
||||
/v1/tenant/{tenant_id}/time_travel_remote_storage:
|
||||
parameters:
|
||||
@@ -236,13 +251,6 @@ paths:
|
||||
type: string
|
||||
format: date-time
|
||||
description: A timestamp to get the LSN
|
||||
- name: with_lease
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: Whether to grant a lease to the corresponding LSN. Default to false.
|
||||
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
@@ -265,19 +273,15 @@ paths:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Obtains a lease for the given LSN.
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- lsn
|
||||
properties:
|
||||
lsn:
|
||||
description: A LSN to obtain the lease for.
|
||||
type: string
|
||||
format: hex
|
||||
description: Obtain lease for the given LSN
|
||||
parameters:
|
||||
- name: lsn
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
description: A LSN to obtain the lease for
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
@@ -371,11 +375,62 @@ paths:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"409":
|
||||
description: |
|
||||
The tenant is already being modified, perhaps by a concurrent call to this API
|
||||
The tenant is already known to Pageserver in some way,
|
||||
and hence this `/attach` call has been rejected.
|
||||
|
||||
Some examples of how this can happen:
|
||||
- tenant was created on this pageserver
|
||||
- tenant attachment was started by an earlier call to `/attach`.
|
||||
|
||||
Callers should poll the tenant status's `attachment_status` field,
|
||||
like for status 202. See the longer description for `POST /attach`
|
||||
for details.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory.
|
||||
Files on local disk and remote storage are not affected.
|
||||
|
||||
Future pageserver restarts won't load the data back until `load` is called on such tenant.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Schedules an operation that attempts to load a tenant from the local disk and
|
||||
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
|
||||
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLoadRequest"
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
@@ -757,6 +812,8 @@ components:
|
||||
For example this can be caused by s3 being unreachable. The retry may be implemented
|
||||
with call to detach, though it would be better to not automate it and inspec failed state
|
||||
manually before proceeding with a retry.
|
||||
|
||||
See the tenant `/attach` endpoint for more information.
|
||||
type: object
|
||||
required:
|
||||
- slug
|
||||
@@ -1029,10 +1086,6 @@ components:
|
||||
kind:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
valid_until:
|
||||
type: string
|
||||
format: date-time
|
||||
description: The expiration time of the granted lease.
|
||||
|
||||
LsnLease:
|
||||
type: object
|
||||
|
||||
@@ -21,8 +21,6 @@ use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
@@ -32,28 +30,32 @@ use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
};
|
||||
use pageserver_api::shard::ShardCount;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
@@ -75,12 +77,13 @@ use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::SpawnMode;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
||||
TimelineInfo,
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
@@ -202,6 +205,7 @@ impl From<TenantSlotError> for ApiError {
|
||||
NotFound(tenant_id) => {
|
||||
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||
}
|
||||
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
|
||||
InProgress => {
|
||||
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||
}
|
||||
@@ -228,7 +232,7 @@ impl From<UpsertLocationError> for ApiError {
|
||||
BadRequest(e) => ApiError::BadRequest(e),
|
||||
Unavailable(_) => ApiError::ShuttingDown,
|
||||
e @ InProgress => ApiError::Conflict(format!("{e}")),
|
||||
Flush(e) | InternalError(e) => ApiError::InternalServerError(e),
|
||||
Flush(e) | Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -326,12 +330,18 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTenantError::*;
|
||||
impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
|
||||
use crate::tenant::delete::DeleteTenantError::*;
|
||||
match value {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
|
||||
SlotError(e) => e.into(),
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
@@ -342,7 +352,7 @@ async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
force_await_initial_logical_size: bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -371,7 +381,7 @@ async fn build_timeline_info(
|
||||
|
||||
async fn build_timeline_info_common(
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
@@ -407,8 +417,6 @@ async fn build_timeline_info_common(
|
||||
|
||||
let walreceiver_status = timeline.walreceiver_status();
|
||||
|
||||
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_shard_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -429,8 +437,6 @@ async fn build_timeline_info_common(
|
||||
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: None,
|
||||
pitr_history_size,
|
||||
within_ancestor_pitr,
|
||||
timeline_dir_layer_file_size_sum: None,
|
||||
wal_source_connstr,
|
||||
last_received_msg_lsn,
|
||||
@@ -496,7 +502,7 @@ async fn timeline_create_handler(
|
||||
|
||||
let new_timeline_id = request_data.new_timeline_id;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
@@ -521,7 +527,7 @@ async fn timeline_create_handler(
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||
request_data.existing_initdb_timeline_id,
|
||||
state.broker_client.clone(),
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -529,7 +535,7 @@ async fn timeline_create_handler(
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(
|
||||
&new_timeline,
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
tenant::timeline::GetLogicalSizePriority::User,
|
||||
)
|
||||
.await
|
||||
@@ -587,7 +593,7 @@ async fn timeline_list_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = state
|
||||
@@ -604,7 +610,7 @@ async fn timeline_list_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
.await
|
||||
@@ -674,7 +680,7 @@ async fn timeline_detail_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Logical size calculation needs downloading.
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline_info = async {
|
||||
@@ -690,7 +696,7 @@ async fn timeline_detail_handler(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await
|
||||
.context("get local timeline info")
|
||||
@@ -729,25 +735,18 @@ async fn get_lsn_by_timestamp_handler(
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &mut ctx)
|
||||
.await?;
|
||||
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
lease: Option<LsnLease>,
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
@@ -755,28 +754,11 @@ async fn get_lsn_by_timestamp_handler(
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
|
||||
let lease = if with_lease {
|
||||
timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
|
||||
.inspect_err(|_| {
|
||||
warn!("fail to grant a lease to {}", lsn);
|
||||
})
|
||||
.ok()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let result = Result { lsn, kind, lease };
|
||||
let valid_until = result
|
||||
.lease
|
||||
.as_ref()
|
||||
.map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string());
|
||||
let result = Result { lsn, kind };
|
||||
tracing::info!(
|
||||
lsn=?result.lsn,
|
||||
kind=%result.kind,
|
||||
timestamp=%timestamp_raw,
|
||||
valid_until=?valid_until,
|
||||
"lsn_by_timestamp finished"
|
||||
);
|
||||
json_response(StatusCode::OK, result)
|
||||
@@ -804,11 +786,11 @@ async fn get_timestamp_of_lsn_handler(
|
||||
.with_context(|| format!("Invalid LSN: {lsn_str:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &mut ctx).await?;
|
||||
|
||||
match result {
|
||||
Some(time) => {
|
||||
@@ -821,6 +803,58 @@ async fn get_timestamp_of_lsn_handler(
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_attach_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
let tenant_conf = match &maybe_body {
|
||||
Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
|
||||
None => TenantConfOpt::default(),
|
||||
};
|
||||
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
info!("Handling tenant attach {tenant_id}");
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let shard_params = ShardParameters::default();
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &mut ctx)
|
||||
.await?;
|
||||
|
||||
let Some(tenant) = tenant else {
|
||||
// This should never happen: indicates a bug in upsert_location
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Upsert succeeded but didn't return tenant!"
|
||||
)));
|
||||
};
|
||||
|
||||
// We might have successfully constructed a Tenant, but it could still
|
||||
// end up in a broken state:
|
||||
if let TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} = tenant.current_state()
|
||||
{
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Tenant state is Broken: {reason}"
|
||||
)));
|
||||
}
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn timeline_delete_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -851,6 +885,33 @@ async fn timeline_delete_handler(
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_detach_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
||||
|
||||
// This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
state
|
||||
.tenant_manager
|
||||
.detach_tenant(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
detach_ignored.unwrap_or(false),
|
||||
&state.deletion_queue_client,
|
||||
)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_reset_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -860,17 +921,65 @@ async fn tenant_reset_handler(
|
||||
|
||||
let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.tenant_manager
|
||||
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
|
||||
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &mut ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
|
||||
// should always use /attach instead.
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
mgr::load_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
state.deletion_queue_client.clone(),
|
||||
&mut ctx,
|
||||
)
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_ignore_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -890,9 +999,7 @@ async fn tenant_list_handler(
|
||||
state: state.clone(),
|
||||
current_physical_size: None,
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: (*gen)
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
generation: (*gen).into(),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -940,10 +1047,7 @@ async fn tenant_status(
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: tenant
|
||||
.generation()
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
generation: tenant.generation().into(),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
@@ -967,16 +1071,23 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
state
|
||||
let status = state
|
||||
.tenant_manager
|
||||
.delete_tenant(tenant_shard_id)
|
||||
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
|
||||
.instrument(info_span!("tenant_delete_handler",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// Callers use 404 as success for deletions, for historical reasons.
|
||||
if status == StatusCode::NOT_FOUND {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Deletion complete").into(),
|
||||
));
|
||||
}
|
||||
|
||||
json_response(status, ())
|
||||
}
|
||||
|
||||
/// HTTP endpoint to query the current tenant_size of a tenant.
|
||||
@@ -1009,7 +1120,7 @@ async fn tenant_size_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
@@ -1021,7 +1132,7 @@ async fn tenant_size_handler(
|
||||
retention_period,
|
||||
LogicalSizeCalculationCause::TenantSizeHandler,
|
||||
&cancel,
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
@@ -1082,7 +1193,7 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
@@ -1095,7 +1206,7 @@ async fn tenant_shard_split_handler(
|
||||
tenant,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -1196,15 +1307,10 @@ fn synthetic_size_html_response(
|
||||
timeline_map.insert(ti.timeline_id, index);
|
||||
timeline_ids.push(ti.timeline_id.to_string());
|
||||
}
|
||||
let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs
|
||||
let seg_to_branch: Vec<usize> = inputs
|
||||
.segments
|
||||
.iter()
|
||||
.map(|seg| {
|
||||
(
|
||||
*timeline_map.get(&seg.timeline_id).unwrap(),
|
||||
seg.kind.into(),
|
||||
)
|
||||
})
|
||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
||||
.collect();
|
||||
|
||||
let svg =
|
||||
@@ -1245,6 +1351,75 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Helper for requests that may take a generation, which is mandatory
|
||||
/// when control_plane_api is set, but otherwise defaults to Generation::none()
|
||||
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
|
||||
if state.conf.control_plane_api.is_some() {
|
||||
req_gen
|
||||
.map(Generation::new)
|
||||
.ok_or(ApiError::BadRequest(anyhow!(
|
||||
"generation attribute missing"
|
||||
)))
|
||||
} else {
|
||||
// Legacy mode: all tenants operate with no generation
|
||||
Ok(Generation::none())
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
let target_tenant_id = request_data.new_tenant_id;
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let _timer = STORAGE_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
|
||||
.expect("bug")
|
||||
.start_timer();
|
||||
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, request_data.generation)?;
|
||||
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let location_conf =
|
||||
LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);
|
||||
|
||||
let new_tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
target_tenant_id,
|
||||
location_conf,
|
||||
None,
|
||||
SpawnMode::Create,
|
||||
&mut ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let Some(new_tenant) = new_tenant else {
|
||||
// This should never happen: indicates a bug in upsert_location
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Upsert succeeded but didn't return tenant!"
|
||||
)));
|
||||
};
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
new_tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
|
||||
)
|
||||
}
|
||||
|
||||
async fn get_tenant_config_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1306,7 +1481,7 @@ async fn update_tenant_config_handler(
|
||||
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -1323,7 +1498,7 @@ async fn put_tenant_location_config_handler(
|
||||
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
|
||||
@@ -1332,7 +1507,7 @@ async fn put_tenant_location_config_handler(
|
||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||
if let Err(e) = state
|
||||
.tenant_manager
|
||||
.detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
|
||||
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
@@ -1362,7 +1537,7 @@ async fn put_tenant_location_config_handler(
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
|
||||
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &mut ctx)
|
||||
.await?;
|
||||
let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size());
|
||||
let attached = tenant.is_some();
|
||||
@@ -1537,15 +1712,17 @@ async fn handle_tenant_break(
|
||||
|
||||
// Obtains an lsn lease on the given timeline.
|
||||
async fn lsn_lease_handler(
|
||||
mut request: Request<Body>,
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let lsn = json_request::<LsnLeaseRequest>(&mut request).await?.lsn;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
||||
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
@@ -1553,7 +1730,7 @@ async fn lsn_lease_handler(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &mut ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
@@ -1570,8 +1747,8 @@ async fn timeline_gc_handler(
|
||||
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &mut ctx).await?;
|
||||
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
}
|
||||
@@ -1594,22 +1771,14 @@ async fn timeline_compact_handler(
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||
if !cfg!(feature = "testing") {
|
||||
return Err(ApiError::InternalServerError(anyhow!(
|
||||
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
||||
)));
|
||||
}
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.compact(&cancel, flags, &mut ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
if wait_until_uploaded {
|
||||
@@ -1643,7 +1812,7 @@ async fn timeline_checkpoint_handler(
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
@@ -1656,7 +1825,7 @@ async fn timeline_checkpoint_handler(
|
||||
}
|
||||
})?;
|
||||
timeline
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.compact(&cancel, flags, &mut ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
@@ -1749,8 +1918,8 @@ async fn timeline_detach_ancestor_handler(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
let mut ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &mut ctx;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
@@ -1834,10 +2003,10 @@ async fn getpage_at_lsn_handler(
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
|
||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||
let page = timeline.get(key.0, lsn, &mut ctx).await?;
|
||||
|
||||
Result::<_, ApiError>::Ok(
|
||||
Response::builder()
|
||||
@@ -1863,11 +2032,11 @@ async fn timeline_collect_keyspace(
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let (dense_ks, sparse_ks) = timeline
|
||||
.collect_keyspace(at_lsn, &ctx)
|
||||
.collect_keyspace(at_lsn, &mut ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
@@ -2256,8 +2425,8 @@ async fn list_aux_files(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &mut ctx).await?;
|
||||
json_response(StatusCode::OK, files)
|
||||
}
|
||||
|
||||
@@ -2298,15 +2467,15 @@ async fn ingest_aux_files(
|
||||
let mut modification = timeline.begin_modification(
|
||||
Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
|
||||
);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let mut ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
for (fname, content) in body.aux_files {
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.put_file(&fname, content.as_bytes(), &mut ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
.commit(&mut ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -2548,6 +2717,7 @@ pub fn make_router(
|
||||
api_handler(r, reload_auth_validation_keys_handler)
|
||||
})
|
||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||
.get("/v1/tenant/:tenant_shard_id", |r| {
|
||||
api_handler(r, tenant_status)
|
||||
})
|
||||
@@ -2585,9 +2755,21 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||
api_handler(r, timeline_create_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/attach", |r| {
|
||||
api_handler(r, tenant_attach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||
api_handler(r, tenant_detach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
|
||||
api_handler(r, tenant_reset_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||
api_handler(r, tenant_load_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
api_handler(r, tenant_ignore_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
|
||||
@@ -53,7 +53,7 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
tline: &Timeline,
|
||||
pgdata_path: &Utf8Path,
|
||||
pgdata_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
@@ -121,7 +121,7 @@ async fn import_rel(
|
||||
dboid: Oid,
|
||||
reader: &mut (impl AsyncRead + Unpin),
|
||||
len: usize,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Does it look like a relation file?
|
||||
trace!("importing rel file {}", path.display());
|
||||
@@ -210,7 +210,7 @@ async fn import_slru(
|
||||
path: &Path,
|
||||
reader: &mut (impl AsyncRead + Unpin),
|
||||
len: usize,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("importing slru file {path:?}");
|
||||
|
||||
@@ -268,7 +268,7 @@ async fn import_wal(
|
||||
tline: &Timeline,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
|
||||
|
||||
@@ -346,7 +346,7 @@ pub async fn import_basebackup_from_tar(
|
||||
tline: &Timeline,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<()> {
|
||||
info!("importing base at {base_lsn}");
|
||||
let mut modification = tline.begin_modification(base_lsn);
|
||||
@@ -397,7 +397,7 @@ pub async fn import_wal_from_tar(
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<()> {
|
||||
// Set up walingest mutable state
|
||||
let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version);
|
||||
@@ -489,7 +489,7 @@ async fn import_file(
|
||||
file_path: &Path,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
len: usize,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
let file_name = match file_path.file_name() {
|
||||
Some(name) => name.to_string_lossy(),
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
#[default]
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub(crate) enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
|
||||
impl L0FlushGlobalState {
|
||||
pub fn new(config: L0FlushConfig) -> Self {
|
||||
match config {
|
||||
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
|
||||
L0FlushConfig::Direct { max_concurrency } => {
|
||||
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
||||
Self(Arc::new(Inner::Direct { semaphore }))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn inner(&self) -> &Arc<Inner> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl L0FlushConfig {
|
||||
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
|
||||
use L0FlushConfig::*;
|
||||
match self {
|
||||
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
|
||||
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,6 @@ pub mod deletion_queue;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod l0_flush;
|
||||
pub use pageserver_api::keyspace;
|
||||
pub mod aux_file;
|
||||
pub mod metrics;
|
||||
@@ -114,7 +113,11 @@ pub async fn shutdown_pageserver(
|
||||
}
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config-v1`.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
@@ -133,6 +136,13 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
/// into pageserver's memory before being ignored.
|
||||
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
|
||||
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
|
||||
|
||||
pub fn is_temporary(path: &Utf8Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
|
||||
|
||||
@@ -8,7 +8,7 @@ use metrics::{
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::{EnumCount, VariantNames};
|
||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use tracing::warn;
|
||||
use utils::id::TimelineId;
|
||||
@@ -53,6 +53,9 @@ pub(crate) enum StorageTimeOperation {
|
||||
|
||||
#[strum(serialize = "find gc cutoffs")]
|
||||
FindGcCutoffs,
|
||||
|
||||
#[strum(serialize = "create tenant")]
|
||||
CreateTenant,
|
||||
}
|
||||
|
||||
pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
|
||||
@@ -330,7 +333,7 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
|
||||
});
|
||||
|
||||
impl PageCacheMetrics {
|
||||
pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
|
||||
pub(crate) fn for_ctx(&self, ctx: &mut RequestContext) -> &PageCacheMetricsForTaskKind {
|
||||
&self.map[ctx.task_kind()][ctx.page_content_kind()]
|
||||
}
|
||||
}
|
||||
@@ -464,24 +467,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_pitr_history_size",
|
||||
"Data written since PITR cutoff on this timeline",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_archive_size",
|
||||
"Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_standby_horizon",
|
||||
@@ -494,7 +479,7 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_resident_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem, for attached locations.",
|
||||
"The size of the layer files present in the pageserver's filesystem.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -560,15 +545,6 @@ static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_valid_lsn_lease_count",
|
||||
"The number of valid leases after refreshing gc info.",
|
||||
&["tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -1094,13 +1070,21 @@ pub(crate) mod virtual_file_io_engine {
|
||||
});
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
per_tenant_timeline: Histogram,
|
||||
}
|
||||
|
||||
impl GlobalAndPerTimelineHistogram {
|
||||
fn observe(&self, value: f64) {
|
||||
self.global.observe(value);
|
||||
self.per_tenant_timeline.observe(value);
|
||||
}
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
global_metric: &'a Histogram,
|
||||
|
||||
// Optional because not all op types are tracked per-timeline
|
||||
timeline_metric: Option<&'a Histogram>,
|
||||
|
||||
ctx: &'c RequestContext,
|
||||
h: &'a GlobalAndPerTimelineHistogram,
|
||||
start: std::time::Instant,
|
||||
op: SmgrQueryType,
|
||||
}
|
||||
@@ -1108,35 +1092,11 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(res) => res,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
|
||||
RateLimit::new(Duration::from_secs(10))
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[self.op];
|
||||
rate_limit.call(|| {
|
||||
warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
self.global_metric.observe(ex_throttled.as_secs_f64());
|
||||
if let Some(timeline_metric) = self.timeline_metric {
|
||||
timeline_metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
self.h.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
@@ -1158,8 +1118,7 @@ pub enum SmgrQueryType {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_metrics: [Histogram; SmgrQueryType::COUNT],
|
||||
per_timeline_getpage: Histogram,
|
||||
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
|
||||
}
|
||||
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
@@ -1237,64 +1196,29 @@ impl SmgrQueryTimePerTimeline {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let global_metrics = std::array::from_fn(|i| {
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
SMGR_QUERY_TIME_GLOBAL
|
||||
let global = SMGR_QUERY_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
&tenant_id,
|
||||
&shard_slug,
|
||||
&timeline_id,
|
||||
])
|
||||
.unwrap();
|
||||
Self {
|
||||
global_metrics,
|
||||
per_timeline_getpage,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_timer<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + '_> {
|
||||
let global_metric = &self.global_metrics[op as usize];
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
|
||||
RateLimit::new(Duration::from_secs(10))
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[op];
|
||||
rate_limit.call(|| {
|
||||
warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
.unwrap();
|
||||
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
GlobalAndPerTimelineHistogram {
|
||||
global,
|
||||
per_tenant_timeline,
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
|
||||
Some(&self.per_timeline_getpage)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Some(GlobalAndPerTimelineHistogramTimer {
|
||||
global_metric,
|
||||
timeline_metric,
|
||||
ctx,
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
pub(crate) fn start_timer<'a>(&'a self, op: SmgrQueryType) -> impl Drop + 'a {
|
||||
let metric = &self.metrics[op as usize];
|
||||
let start = Instant::now();
|
||||
GlobalAndPerTimelineHistogramTimer {
|
||||
h: metric,
|
||||
start,
|
||||
op,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1341,25 +1265,28 @@ mod smgr_query_time_tests {
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
|
||||
.map(|op| metrics.metrics[*op as usize].global.get_sample_count())
|
||||
.sum();
|
||||
(global, metrics.per_timeline_getpage.get_sample_count())
|
||||
let per_tenant_timeline: u64 = ops
|
||||
.iter()
|
||||
.map(|op| {
|
||||
metrics.metrics[*op as usize]
|
||||
.per_tenant_timeline
|
||||
.get_sample_count()
|
||||
})
|
||||
.sum();
|
||||
(global, per_tenant_timeline)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
let timer = metrics.start_timer(*op, &ctx);
|
||||
let timer = metrics.start_timer(*op);
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
|
||||
// getpage ops are tracked per-timeline, others aren't
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
} else {
|
||||
assert_eq!(post_per_tenant_timeline, 0);
|
||||
}
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
@@ -1465,45 +1392,6 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||
pub(crate) enum ComputeCommandKind {
|
||||
PageStreamV2,
|
||||
PageStream,
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
ImportBasebackup,
|
||||
ImportWal,
|
||||
LeaseLsn,
|
||||
Show,
|
||||
}
|
||||
|
||||
pub(crate) struct ComputeCommandCounters {
|
||||
map: EnumMap<ComputeCommandKind, IntCounter>,
|
||||
}
|
||||
|
||||
pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
|
||||
let inner = register_int_counter_vec!(
|
||||
"pageserver_compute_commands",
|
||||
"Number of compute -> pageserver commands processed",
|
||||
&["command"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
ComputeCommandCounters {
|
||||
map: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
|
||||
let command_str: &'static str = command.into();
|
||||
inner.with_label_values(&[command_str])
|
||||
})),
|
||||
}
|
||||
});
|
||||
|
||||
impl ComputeCommandCounters {
|
||||
pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
|
||||
&self.map[command]
|
||||
}
|
||||
}
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
@@ -1713,15 +1601,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_secondary_resident_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem, for secondary locations.",
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -2124,8 +2003,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
@@ -2134,8 +2011,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
/// Number of valid LSN leases.
|
||||
pub valid_lsn_lease_count_gauge: UIntGauge,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -2199,15 +2074,6 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let pitr_history_size = PITR_HISTORY_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let archival_size = TIMELINE_ARCHIVE_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -2243,10 +2109,6 @@ impl TimelineMetrics {
|
||||
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
|
||||
.build(&tenant_id, &shard_id, &timeline_id);
|
||||
|
||||
let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -2260,8 +2122,6 @@ impl TimelineMetrics {
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
@@ -2271,7 +2131,6 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
valid_lsn_lease_count_gauge,
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
}
|
||||
}
|
||||
@@ -2319,13 +2178,8 @@ impl TimelineMetrics {
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -2356,12 +2210,14 @@ impl TimelineMetrics {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
for op in SmgrQueryType::iter() {
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
op.into(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3032,5 +2888,4 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&tenant_throttling::TIMELINE_GET);
|
||||
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
||||
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
|
||||
}
|
||||
|
||||
@@ -331,7 +331,7 @@ impl PageCache {
|
||||
&self,
|
||||
file_id: FileId,
|
||||
blkno: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
|
||||
.await
|
||||
@@ -422,7 +422,7 @@ impl PageCache {
|
||||
async fn lock_for_read(
|
||||
&self,
|
||||
cache_key: &CacheKey,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
@@ -554,7 +554,7 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
ctx: RequestContext,
|
||||
mut ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -624,7 +624,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &mut ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
@@ -634,7 +634,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &mut ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
@@ -645,7 +645,7 @@ impl PageServerHandler {
|
||||
// shard_id is filled in by the handler
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
|
||||
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &mut ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
@@ -655,7 +655,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &mut ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
@@ -665,7 +665,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &mut ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
@@ -728,7 +728,7 @@ impl PageServerHandler {
|
||||
base_lsn: Lsn,
|
||||
_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
ctx: RequestContext,
|
||||
mut ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -741,7 +741,7 @@ impl PageServerHandler {
|
||||
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &mut ctx)
|
||||
.await?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
@@ -766,7 +766,7 @@ impl PageServerHandler {
|
||||
&mut copyin_reader,
|
||||
base_lsn,
|
||||
self.broker_client.clone(),
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -791,7 +791,7 @@ impl PageServerHandler {
|
||||
timeline_id: TimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
mut ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -814,7 +814,7 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &mut ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
// Read the end of the tar archive.
|
||||
@@ -867,7 +867,7 @@ impl PageServerHandler {
|
||||
request_lsn: Lsn,
|
||||
not_modified_since: Lsn,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
@@ -926,7 +926,7 @@ impl PageServerHandler {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -958,12 +958,12 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamExistsRequest,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
@@ -990,13 +990,13 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamNblocksRequest,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
@@ -1023,13 +1023,13 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
@@ -1173,7 +1173,7 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamGetPageRequest,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = match self.get_cached_timeline_for_page(req) {
|
||||
Ok(tl) => {
|
||||
@@ -1206,7 +1206,7 @@ impl PageServerHandler {
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
@@ -1233,13 +1233,13 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamGetSlruSegmentRequest,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetSlruSegment);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
@@ -1275,7 +1275,7 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1534,7 +1534,7 @@ where
|
||||
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
let mut ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
|
||||
@@ -1554,10 +1554,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
@@ -1583,10 +1579,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStream)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
@@ -1613,10 +1605,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
@@ -1636,7 +1624,7 @@ where
|
||||
}
|
||||
};
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&mut ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
@@ -1646,7 +1634,7 @@ where
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -1656,6 +1644,48 @@ where
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
async {
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await?;
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::text_col(b"prev_lsn"),
|
||||
RowDescriptor::text_col(b"last_lsn"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(end_of_timeline.prev.to_string().as_bytes()),
|
||||
Some(end_of_timeline.last.to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
.instrument(info_span!(
|
||||
"handle_get_last_record_lsn",
|
||||
shard_id = tracing::field::Empty
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
|
||||
if params.len() < 2 {
|
||||
@@ -1693,10 +1723,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Fullbackup)
|
||||
.inc();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
@@ -1706,7 +1732,7 @@ where
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
&ctx,
|
||||
&mut ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -1745,10 +1771,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::ImportBasebackup)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_import_basebackup(
|
||||
pgb,
|
||||
@@ -1796,10 +1818,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::ImportWal)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
|
||||
.await
|
||||
@@ -1837,16 +1855,12 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::LeaseLsn)
|
||||
.inc();
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &mut ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
@@ -1872,10 +1886,6 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Show)
|
||||
.inc();
|
||||
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
|
||||
@@ -188,7 +188,7 @@ impl Timeline {
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(
|
||||
@@ -218,7 +218,7 @@ impl Timeline {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<usize, PageReconstructError> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
@@ -236,7 +236,7 @@ impl Timeline {
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(
|
||||
@@ -272,7 +272,7 @@ impl Timeline {
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(
|
||||
@@ -307,7 +307,7 @@ impl Timeline {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
@@ -335,7 +335,7 @@ impl Timeline {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let n_blocks = self
|
||||
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
|
||||
@@ -357,7 +357,7 @@ impl Timeline {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn, ctx).await
|
||||
@@ -369,7 +369,7 @@ impl Timeline {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
@@ -382,7 +382,7 @@ impl Timeline {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
@@ -408,7 +408,7 @@ impl Timeline {
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
pausable_failpoint!("find-lsn-for-timestamp-pausable");
|
||||
|
||||
@@ -499,7 +499,7 @@ impl Timeline {
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
if timestamp >= search_timestamp {
|
||||
@@ -519,7 +519,7 @@ impl Timeline {
|
||||
pub(crate) async fn get_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Option<TimestampTz>, PageReconstructError> {
|
||||
let mut max: Option<TimestampTz> = None;
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
@@ -542,7 +542,7 @@ impl Timeline {
|
||||
async fn map_all_timestamps<T: Default>(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
@@ -575,7 +575,7 @@ impl Timeline {
|
||||
pub(crate) async fn get_slru_keyspace(
|
||||
&self,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<KeySpace, PageReconstructError> {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
|
||||
@@ -604,7 +604,7 @@ impl Timeline {
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
@@ -621,7 +621,7 @@ impl Timeline {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
@@ -632,7 +632,7 @@ impl Timeline {
|
||||
pub(crate) async fn list_dbdirs(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashMap<(Oid, Oid), bool>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
@@ -647,7 +647,7 @@ impl Timeline {
|
||||
&self,
|
||||
xid: TransactionId,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = twophase_file_key(xid);
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
@@ -657,7 +657,7 @@ impl Timeline {
|
||||
pub(crate) async fn list_twophase_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashSet<TransactionId>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||
@@ -671,7 +671,7 @@ impl Timeline {
|
||||
pub(crate) async fn get_control_file(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.get(CONTROLFILE_KEY, lsn, ctx).await
|
||||
}
|
||||
@@ -679,7 +679,7 @@ impl Timeline {
|
||||
pub(crate) async fn get_checkpoint(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
@@ -687,7 +687,7 @@ impl Timeline {
|
||||
async fn list_aux_files_v1(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
||||
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
|
||||
@@ -705,7 +705,7 @@ impl Timeline {
|
||||
async fn list_aux_files_v2(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
||||
@@ -729,7 +729,7 @@ impl Timeline {
|
||||
pub(crate) async fn trigger_aux_file_size_computation(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
|
||||
@@ -741,7 +741,7 @@ impl Timeline {
|
||||
pub(crate) async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
@@ -779,7 +779,7 @@ impl Timeline {
|
||||
pub(crate) async fn get_replorigins(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
|
||||
@@ -809,7 +809,7 @@ impl Timeline {
|
||||
pub(crate) async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
|
||||
@@ -845,7 +845,7 @@ impl Timeline {
|
||||
pub(crate) async fn collect_keyspace(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeySpaceAccum::new();
|
||||
@@ -1145,7 +1145,7 @@ impl<'a> DatadirModification<'a> {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
@@ -1182,7 +1182,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
xid: TransactionId,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
@@ -1229,7 +1229,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let total_blocks = self
|
||||
.tline
|
||||
@@ -1266,7 +1266,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), RelationError> {
|
||||
if rel.relnode == 0 {
|
||||
return Err(RelationError::InvalidRelnode);
|
||||
@@ -1328,7 +1328,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
if self
|
||||
@@ -1362,7 +1362,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
@@ -1384,7 +1384,11 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Drop a relation.
|
||||
pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_drop(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
// Remove it from the directory entry
|
||||
@@ -1420,7 +1424,7 @@ impl<'a> DatadirModification<'a> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
@@ -1466,7 +1470,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
@@ -1499,7 +1503,7 @@ impl<'a> DatadirModification<'a> {
|
||||
pub async fn drop_twophase_file(
|
||||
&mut self,
|
||||
xid: TransactionId,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
@@ -1538,7 +1542,7 @@ impl<'a> DatadirModification<'a> {
|
||||
&mut self,
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let switch_policy = self.tline.get_switch_aux_file_policy();
|
||||
|
||||
@@ -1731,7 +1735,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub async fn flush(&mut self, ctx: &mut RequestContext) -> anyhow::Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -1777,7 +1781,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// underlying timeline.
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub async fn commit(&mut self, ctx: &mut RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -1828,7 +1832,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
async fn get(&self, key: Key, ctx: &mut RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// version in that case.
|
||||
//
|
||||
@@ -1895,7 +1899,7 @@ impl<'a> Version<'a> {
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
key: Key,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
match self {
|
||||
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,20 +6,13 @@
|
||||
//! is written as a one byte. If it's larger than that, the length
|
||||
//! is written as a four-byte integer, in big-endian, with the high
|
||||
//! bit set. This way, we can detect whether it's 1- or 4-byte header
|
||||
//! by peeking at the first byte. For blobs larger than 128 bits,
|
||||
//! we also specify three reserved bits, only one of the three bit
|
||||
//! patterns is currently in use (0b011) and signifies compression
|
||||
//! with zstd.
|
||||
//! by peeking at the first byte.
|
||||
//!
|
||||
//! len < 128: 0XXXXXXX
|
||||
//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use async_compression::Level;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
@@ -33,7 +26,7 @@ impl<'a> BlockCursor<'a> {
|
||||
pub async fn read_blob(
|
||||
&self,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf, ctx).await?;
|
||||
@@ -45,7 +38,7 @@ impl<'a> BlockCursor<'a> {
|
||||
&self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut blknum = (offset / PAGE_SZ as u64) as u32;
|
||||
let mut off = (offset % PAGE_SZ as u64) as usize;
|
||||
@@ -73,37 +66,12 @@ impl<'a> BlockCursor<'a> {
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
let bit_mask = if self.read_compressed {
|
||||
!LEN_COMPRESSION_BIT_MASK
|
||||
} else {
|
||||
0x7f
|
||||
};
|
||||
len_buf[0] &= bit_mask;
|
||||
len_buf[0] &= 0x7f;
|
||||
u32::from_be_bytes(len_buf) as usize
|
||||
};
|
||||
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
|
||||
|
||||
let mut tmp_buf = Vec::new();
|
||||
let buf_to_write;
|
||||
let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
|
||||
if compression_bits > BYTE_UNCOMPRESSED {
|
||||
warn!("reading key above future limit ({len} bytes)");
|
||||
}
|
||||
buf_to_write = dstbuf;
|
||||
None
|
||||
} else if compression_bits == BYTE_ZSTD {
|
||||
buf_to_write = &mut tmp_buf;
|
||||
Some(dstbuf)
|
||||
} else {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("invalid compression byte {compression_bits:x}"),
|
||||
);
|
||||
return Err(error);
|
||||
};
|
||||
|
||||
buf_to_write.clear();
|
||||
buf_to_write.reserve(len);
|
||||
dstbuf.clear();
|
||||
dstbuf.reserve(len);
|
||||
|
||||
// Read the payload
|
||||
let mut remain = len;
|
||||
@@ -117,35 +85,14 @@ impl<'a> BlockCursor<'a> {
|
||||
page_remain = PAGE_SZ;
|
||||
}
|
||||
let this_blk_len = min(remain, page_remain);
|
||||
buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]);
|
||||
dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
|
||||
remain -= this_blk_len;
|
||||
off += this_blk_len;
|
||||
}
|
||||
|
||||
if let Some(dstbuf) = compression {
|
||||
if compression_bits == BYTE_ZSTD {
|
||||
let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf);
|
||||
decoder.write_all(buf_to_write).await?;
|
||||
decoder.flush().await?;
|
||||
} else {
|
||||
unreachable!("already checked above")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Reserved bits for length and compression
|
||||
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
||||
|
||||
/// The maximum size of blobs we support. The highest few bits
|
||||
/// are reserved for compression and other further uses.
|
||||
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
|
||||
|
||||
const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||
|
||||
/// A wrapper of `VirtualFile` that allows users to write blobs.
|
||||
///
|
||||
/// If a `BlobWriter` is dropped, the internal buffer will be
|
||||
@@ -183,7 +130,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
|
||||
let nbytes = match res {
|
||||
@@ -196,7 +143,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
|
||||
#[inline(always)]
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
|
||||
pub async fn flush_buffer(&mut self, ctx: &mut RequestContext) -> Result<(), Error> {
|
||||
let buf = std::mem::take(&mut self.buf);
|
||||
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
|
||||
res?;
|
||||
@@ -219,7 +166,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
@@ -271,23 +218,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
self.write_blob_maybe_compressed(
|
||||
srcbuf,
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::DisabledNoDecompress,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
algorithm: ImageCompressionAlgorithm,
|
||||
ctx: &mut RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
let offset = self.offset;
|
||||
|
||||
@@ -295,61 +226,29 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
|
||||
let mut io_buf = self.io_buf.take().expect("we always put it back below");
|
||||
io_buf.clear();
|
||||
let mut compressed_buf = None;
|
||||
let ((io_buf, hdr_res), srcbuf) = async {
|
||||
let (io_buf, hdr_res) = async {
|
||||
if len < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
io_buf.put_u8(len as u8);
|
||||
(
|
||||
self.write_all(io_buf, ctx).await,
|
||||
srcbuf.slice_full().into_inner(),
|
||||
)
|
||||
self.write_all(io_buf, ctx).await
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if len > MAX_SUPPORTED_LEN {
|
||||
if len > 0x7fff_ffff {
|
||||
return (
|
||||
(
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
)),
|
||||
),
|
||||
srcbuf.slice_full().into_inner(),
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
)),
|
||||
);
|
||||
}
|
||||
let (high_bit_mask, len_written, srcbuf) = match algorithm {
|
||||
ImageCompressionAlgorithm::Zstd { level } => {
|
||||
let mut encoder = if let Some(level) = level {
|
||||
async_compression::tokio::write::ZstdEncoder::with_quality(
|
||||
Vec::new(),
|
||||
Level::Precise(level.into()),
|
||||
)
|
||||
} else {
|
||||
async_compression::tokio::write::ZstdEncoder::new(Vec::new())
|
||||
};
|
||||
let slice = srcbuf.slice_full();
|
||||
encoder.write_all(&slice[..]).await.unwrap();
|
||||
encoder.shutdown().await.unwrap();
|
||||
let compressed = encoder.into_inner();
|
||||
if compressed.len() < len {
|
||||
let compressed_len = compressed.len();
|
||||
compressed_buf = Some(compressed);
|
||||
(BYTE_ZSTD, compressed_len, slice.into_inner())
|
||||
} else {
|
||||
(BYTE_UNCOMPRESSED, len, slice.into_inner())
|
||||
}
|
||||
}
|
||||
ImageCompressionAlgorithm::Disabled
|
||||
| ImageCompressionAlgorithm::DisabledNoDecompress => {
|
||||
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
|
||||
}
|
||||
};
|
||||
let mut len_buf = (len_written as u32).to_be_bytes();
|
||||
assert_eq!(len_buf[0] & 0xf0, 0);
|
||||
len_buf[0] |= high_bit_mask;
|
||||
if len > 0x0fff_ffff {
|
||||
tracing::warn!("writing blob above future limit ({len} bytes)");
|
||||
}
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
(self.write_all(io_buf, ctx).await, srcbuf)
|
||||
self.write_all(io_buf, ctx).await
|
||||
}
|
||||
}
|
||||
.await;
|
||||
@@ -358,12 +257,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
Ok(_) => (),
|
||||
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
|
||||
}
|
||||
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
|
||||
let (_buf, res) = self.write_all(compressed_buf, ctx).await;
|
||||
(Slice::into_inner(srcbuf.slice(..)), res)
|
||||
} else {
|
||||
self.write_all(srcbuf, ctx).await
|
||||
};
|
||||
let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
|
||||
(srcbuf, res.map(|_| offset))
|
||||
}
|
||||
}
|
||||
@@ -373,7 +267,7 @@ impl BlobWriter<true> {
|
||||
///
|
||||
/// This function flushes the internal buffer before giving access
|
||||
/// to the underlying `VirtualFile`.
|
||||
pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<VirtualFile, Error> {
|
||||
pub async fn into_inner(mut self, ctx: &mut RequestContext) -> Result<VirtualFile, Error> {
|
||||
self.flush_buffer(ctx).await?;
|
||||
Ok(self.inner)
|
||||
}
|
||||
@@ -401,13 +295,6 @@ mod tests {
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
|
||||
round_trip_test_compressed::<BUFFERED>(blobs, false).await
|
||||
}
|
||||
|
||||
async fn round_trip_test_compressed<const BUFFERED: bool>(
|
||||
blobs: &[Vec<u8>],
|
||||
compression: bool,
|
||||
) -> Result<(), Error> {
|
||||
let temp_dir = camino_tempfile::tempdir()?;
|
||||
let pathbuf = temp_dir.path().join("file");
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
@@ -418,16 +305,7 @@ mod tests {
|
||||
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = if compression {
|
||||
wtr.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
&ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
wtr.write_blob(blob.clone(), &ctx).await
|
||||
};
|
||||
let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
|
||||
let offs = res?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
@@ -441,7 +319,7 @@ mod tests {
|
||||
|
||||
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
|
||||
let rdr = BlockReaderRef::VirtualFile(&file);
|
||||
let rdr = BlockCursor::new_with_compression(rdr, compression);
|
||||
let rdr = BlockCursor::new(rdr);
|
||||
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
|
||||
let blob_read = rdr.read_blob(*offset, &ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -475,8 +353,6 @@ mod tests {
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
round_trip_test_compressed::<false>(blobs, true).await?;
|
||||
round_trip_test_compressed::<true>(blobs, true).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -485,15 +361,10 @@ mod tests {
|
||||
let blobs = &[
|
||||
b"test".to_vec(),
|
||||
random_array(10 * PAGE_SZ),
|
||||
b"hello".to_vec(),
|
||||
random_array(66 * PAGE_SZ),
|
||||
vec![0xf3; 24 * PAGE_SZ],
|
||||
b"foobar".to_vec(),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
round_trip_test_compressed::<false>(blobs, true).await?;
|
||||
round_trip_test_compressed::<true>(blobs, true).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ where
|
||||
pub enum BlockLease<'a> {
|
||||
PageReadGuard(PageReadGuard<'static>),
|
||||
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
|
||||
Slice(&'a [u8; PAGE_SZ]),
|
||||
#[cfg(test)]
|
||||
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
|
||||
#[cfg(test)]
|
||||
@@ -64,7 +63,6 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
match self {
|
||||
BlockLease::PageReadGuard(v) => v.deref(),
|
||||
BlockLease::EphemeralFileMutableTail(v) => v,
|
||||
BlockLease::Slice(v) => v,
|
||||
#[cfg(test)]
|
||||
BlockLease::Arc(v) => v.deref(),
|
||||
#[cfg(test)]
|
||||
@@ -83,7 +81,6 @@ pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReader(&'a FileBlockReader<'a>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
Slice(&'a [u8]),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
#[cfg(test)]
|
||||
@@ -95,14 +92,13 @@ impl<'a> BlockReaderRef<'a> {
|
||||
async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
||||
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
|
||||
Adapter(r) => r.read_blk(blknum, ctx).await,
|
||||
Slice(s) => Self::read_blk_slice(s, blknum),
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
#[cfg(test)]
|
||||
@@ -111,24 +107,6 @@ impl<'a> BlockReaderRef<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
|
||||
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
|
||||
let end = start.checked_add(PAGE_SZ).unwrap();
|
||||
if end > slice.len() {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
format!("slice too short, len={} end={}", slice.len(), end),
|
||||
));
|
||||
}
|
||||
let slice = &slice[start..end];
|
||||
let page_sized: &[u8; PAGE_SZ] = slice
|
||||
.try_into()
|
||||
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
|
||||
Ok(BlockLease::Slice(page_sized))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||
///
|
||||
@@ -149,24 +127,16 @@ impl<'a> BlockReaderRef<'a> {
|
||||
/// ```
|
||||
///
|
||||
pub struct BlockCursor<'a> {
|
||||
pub(super) read_compressed: bool,
|
||||
reader: BlockReaderRef<'a>,
|
||||
}
|
||||
|
||||
impl<'a> BlockCursor<'a> {
|
||||
pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
|
||||
Self::new_with_compression(reader, false)
|
||||
}
|
||||
pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self {
|
||||
BlockCursor {
|
||||
read_compressed,
|
||||
reader,
|
||||
}
|
||||
BlockCursor { reader }
|
||||
}
|
||||
// Needed by cli
|
||||
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
|
||||
BlockCursor {
|
||||
read_compressed: false,
|
||||
reader: BlockReaderRef::FileBlockReader(reader),
|
||||
}
|
||||
}
|
||||
@@ -180,7 +150,7 @@ impl<'a> BlockCursor<'a> {
|
||||
pub async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum, ctx).await
|
||||
}
|
||||
@@ -190,31 +160,16 @@ impl<'a> BlockCursor<'a> {
|
||||
///
|
||||
/// The file is assumed to be immutable. This doesn't provide any functions
|
||||
/// for modifying the file, nor for invalidating the cache if it is modified.
|
||||
#[derive(Clone)]
|
||||
pub struct FileBlockReader<'a> {
|
||||
pub file: &'a VirtualFile,
|
||||
|
||||
/// Unique ID of this file, used as key in the page cache.
|
||||
file_id: page_cache::FileId,
|
||||
|
||||
compressed_reads: bool,
|
||||
}
|
||||
|
||||
impl<'a> FileBlockReader<'a> {
|
||||
pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
|
||||
Self::new_with_compression(file, file_id, false)
|
||||
}
|
||||
|
||||
pub fn new_with_compression(
|
||||
file: &'a VirtualFile,
|
||||
file_id: FileId,
|
||||
compressed_reads: bool,
|
||||
) -> Self {
|
||||
FileBlockReader {
|
||||
file_id,
|
||||
file,
|
||||
compressed_reads,
|
||||
}
|
||||
FileBlockReader { file_id, file }
|
||||
}
|
||||
|
||||
/// Read a page from the underlying file into given buffer.
|
||||
@@ -222,7 +177,7 @@ impl<'a> FileBlockReader<'a> {
|
||||
&self,
|
||||
buf: PageWriteGuard<'static>,
|
||||
blkno: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<PageWriteGuard<'static>, std::io::Error> {
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
self.file
|
||||
@@ -237,7 +192,7 @@ impl<'a> FileBlockReader<'a> {
|
||||
pub async fn read_blk<'b>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockLease<'b>, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
@@ -261,10 +216,7 @@ impl<'a> FileBlockReader<'a> {
|
||||
|
||||
impl BlockReader for FileBlockReader<'_> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new_with_compression(
|
||||
BlockReaderRef::FileBlockReader(self),
|
||||
self.compressed_reads,
|
||||
)
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReader(self))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -281,6 +281,22 @@ impl LocationConf {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LocationConf {
|
||||
// TODO: this should be removed once tenant loading can guarantee that we are never
|
||||
// loading from a directory without a configuration.
|
||||
// => tech debt since https://github.com/neondatabase/neon/issues/1555
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: Generation::none(),
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
tenant_conf: TenantConfOpt::default(),
|
||||
shard: ShardIdentity::unsharded(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A tenant's calcuated configuration, which is the result of merging a
|
||||
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
||||
///
|
||||
|
||||
662
pageserver/src/tenant/delete.rs
Normal file
662
pageserver/src/tenant/delete.rs
Normal file
@@ -0,0 +1,662 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{models::TenantState, shard::TenantShardId};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, Instrument};
|
||||
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
timeline::ShutdownMode,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Tenant not attached")]
|
||||
NotAttached,
|
||||
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
#[error("Tenant map slot upsert error {0}")]
|
||||
SlotUpsertError(#[from] TenantSlotUpsertError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
|
||||
|
||||
fn remote_tenant_delete_mark_path(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let tenant_remote_path = conf
|
||||
.tenant_path(tenant_shard_id)
|
||||
.strip_prefix(&conf.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
|
||||
let data: &[u8] = &[];
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let data = bytes::Bytes::from_static(data);
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data)));
|
||||
remote_storage
|
||||
.upload(stream, 0, &remote_mark_path, None, cancel)
|
||||
.await
|
||||
},
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("mark_upload")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_local_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
|
||||
|
||||
// Note: we're ok to replace existing file.
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn schedule_ordered_timeline_deletions(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
|
||||
// Tenant is stopping at this point. We know it will be deleted.
|
||||
// No new timelines should be created.
|
||||
// Tree sort timelines to delete from leafs to the root.
|
||||
// NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
|
||||
// can complete and remove timeline from the map in between our call to clone
|
||||
// and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
|
||||
// timelines.lock is currently synchronous so we cant hold it across await point.
|
||||
// So just ignore NotFound error if we get it from `run`.
|
||||
// Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
|
||||
let timelines = tenant.timelines.lock().unwrap().clone();
|
||||
let sorted =
|
||||
tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
|
||||
|
||||
let mut already_running_deletions = vec![];
|
||||
|
||||
for (timeline_id, _) in sorted.into_iter().rev() {
|
||||
let span = tracing::info_span!("timeline_delete", %timeline_id);
|
||||
let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
|
||||
.instrument(span)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
match e {
|
||||
DeleteTimelineError::NotFound => {
|
||||
// Timeline deletion finished after call to clone above but before call
|
||||
// to `DeleteTimelineFlow::run` and removed timeline from the map.
|
||||
continue;
|
||||
}
|
||||
DeleteTimelineError::AlreadyInProgress(guard) => {
|
||||
already_running_deletions.push((guard, timeline_id));
|
||||
continue;
|
||||
}
|
||||
e => return Err(DeleteTenantError::Timeline(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(already_running_deletions)
|
||||
}
|
||||
|
||||
async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
|
||||
let list = &list.into_iter().take(10).collect::<Vec<_>>();
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
||||
async fn cleanup_remaining_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
|
||||
if is_dir {
|
||||
tokio::fs::remove_dir(&p).await
|
||||
} else {
|
||||
tokio::fs::remove_file(&p).await
|
||||
}
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.with_context(|| format!("failed to delete {p}"))
|
||||
};
|
||||
|
||||
rm(conf.tenant_config_path(tenant_shard_id), false).await?;
|
||||
rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-timelines-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.timelines_path(tenant_shard_id), true).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-deleted-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
// So its possible for mark to reach disk first and for other deletions
|
||||
// to be reordered later and thus missed if a crash occurs.
|
||||
// Note that we dont need to sync after mark file is removed
|
||||
// because we can tolerate the case when mark file reappears on startup.
|
||||
let tenant_path = &conf.tenant_path(tenant_shard_id);
|
||||
if tenant_path.exists() {
|
||||
crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
}
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
|
||||
|
||||
rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.tenant_path(tenant_shard_id), true).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Upload remote deletion mark.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Shutdown tasks
|
||||
/// 4. Run ordered timeline deletions
|
||||
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl DeleteTenantFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
// NOTE: static needed for background part.
|
||||
// We assume that calling code sets up the span with tenant_id.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
pausable_failpoint!("tenant-delete-before-run");
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
|
||||
// This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
|
||||
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
|
||||
// So the solution is to set tenant state to broken.
|
||||
async fn run_inner(
|
||||
guard: &mut OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant: &Tenant,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-remote-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-local-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_local_delete_mark(conf, &tenant.tenant_shard_id)
|
||||
.await
|
||||
.context("local delete mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-background", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-background"
|
||||
))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
Some(
|
||||
Arc::clone(&t.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("we're the only owner during init"),
|
||||
)
|
||||
};
|
||||
|
||||
if remote_mark_exists {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf
|
||||
.tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
|
||||
.exists()
|
||||
{
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, false, true)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(preload, super::SpawnMode::Eager, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Check whether background deletion of this tenant is currently in progress
|
||||
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
|
||||
tenant.delete_progress.try_lock().is_err()
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping is needed for retries.
|
||||
if !matches!(
|
||||
tenant.current_state(),
|
||||
TenantState::Active | TenantState::Broken { .. }
|
||||
) {
|
||||
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
|
||||
}
|
||||
|
||||
let guard = Arc::clone(&tenant.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-shutdown", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
|
||||
});
|
||||
|
||||
// make pageserver shutdown not to wait for our completion
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
|
||||
// i e it is an error to do:
|
||||
// tenant.set_stopping
|
||||
// tenant.shutdown
|
||||
// Its also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(guard)
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_shard_id = tenant.tenant_shard_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
"tenant_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) =
|
||||
Self::background(guard, conf, remote_storage, tenants, &tenant).await
|
||||
{
|
||||
error!("Error: {err:#}");
|
||||
tenant.set_broken(format!("{err:#}")).await;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
// Note that if deletion fails we dont mark timelines as broken,
|
||||
// the whole tenant will become broken as by `Self::schedule_background` logic
|
||||
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
|
||||
.await
|
||||
.context("schedule_ordered_timeline_deletions")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-polling-ongoing-deletions"
|
||||
))?
|
||||
});
|
||||
|
||||
// Wait for deletions that were already running at the moment when tenant deletion was requested.
|
||||
// When we can lock deletion guard it means that corresponding timeline deletion finished.
|
||||
for (guard, timeline_id) in already_running_timeline_deletions {
|
||||
let flow = guard.lock().await;
|
||||
if !flow.is_finished() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"already running timeline deletion failed: {timeline_id}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Remove top-level tenant objects that don't belong to a timeline, such as heatmap
|
||||
let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
|
||||
if let Some(Err(e)) = backoff::retry(
|
||||
|| async {
|
||||
remote_storage
|
||||
.delete(&heatmap_path, &task_mgr::shutdown_token())
|
||||
.await
|
||||
},
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_remote_tenant_heatmap",
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
|
||||
}
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
|
||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||
if timelines_path.exists() {
|
||||
// sanity check to guard against layout changes
|
||||
ensure_timelines_dir_empty(&timelines_path)
|
||||
.await
|
||||
.context("timelines dir not empty")?;
|
||||
}
|
||||
|
||||
remove_tenant_remote_delete_mark(
|
||||
conf,
|
||||
&remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
|
||||
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
|
||||
))?
|
||||
});
|
||||
|
||||
cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
{
|
||||
pausable_failpoint!("tenant-delete-before-map-remove");
|
||||
|
||||
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
|
||||
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
|
||||
//
|
||||
// This complexity will go away when we simplify how deletion works:
|
||||
// https://github.com/neondatabase/neon/issues/5080
|
||||
loop {
|
||||
// Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
|
||||
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
|
||||
let barrier = {
|
||||
let mut locked = tenants.write().unwrap();
|
||||
let removed = locked.remove(tenant.tenant_shard_id);
|
||||
|
||||
// FIXME: we should not be modifying this from outside of mgr.rs.
|
||||
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
||||
|
||||
// Update stats
|
||||
match &removed {
|
||||
TenantsMapRemoveResult::Occupied(slot) => {
|
||||
crate::metrics::TENANT_MANAGER.slot_removed(slot);
|
||||
}
|
||||
TenantsMapRemoveResult::InProgress(barrier) => {
|
||||
crate::metrics::TENANT_MANAGER
|
||||
.slot_removed(&TenantSlot::InProgress(barrier.clone()));
|
||||
}
|
||||
TenantsMapRemoveResult::Vacant => {
|
||||
// Nothing changed in map, no metric update
|
||||
}
|
||||
}
|
||||
|
||||
match removed {
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
|
||||
match tenant.current_state() {
|
||||
TenantState::Stopping { .. } | TenantState::Broken { .. } => {
|
||||
// Expected: we put the tenant into stopping state before we start deleting it
|
||||
}
|
||||
state => {
|
||||
// Unexpected state
|
||||
tracing::warn!(
|
||||
"Tenant in unexpected state {state} after deletion"
|
||||
);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
|
||||
// This is unexpected: this secondary tenants should not have been created, and we
|
||||
// are not in a position to shut it down from here.
|
||||
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
|
||||
unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
|
||||
}
|
||||
TenantsMapRemoveResult::Vacant => {
|
||||
tracing::warn!(
|
||||
"Tenant removed from TenantsMap before deletion completed"
|
||||
);
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::InProgress(barrier) => {
|
||||
// An InProgress entry was found, we must wait on its barrier
|
||||
barrier
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"Waiting for competing operation to complete before deleting state for tenant"
|
||||
);
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -212,7 +212,6 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
///
|
||||
/// Public reader object, to search the tree.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct DiskBtreeReader<R, const L: usize>
|
||||
where
|
||||
R: BlockReader,
|
||||
@@ -243,7 +242,7 @@ where
|
||||
///
|
||||
/// Read the value for given key. Returns the value, or None if it doesn't exist.
|
||||
///
|
||||
pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result<Option<u64>> {
|
||||
pub async fn get(&self, search_key: &[u8; L], ctx: &mut RequestContext) -> Result<Option<u64>> {
|
||||
let mut result: Option<u64> = None;
|
||||
self.visit(
|
||||
search_key,
|
||||
@@ -260,38 +259,27 @@ where
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
|
||||
where
|
||||
R: 'a,
|
||||
{
|
||||
pub fn iter<'a>(
|
||||
&'a self,
|
||||
start_key: &'a [u8; L],
|
||||
ctx: &'a RequestContext,
|
||||
) -> DiskBtreeIterator<'a> {
|
||||
DiskBtreeIterator {
|
||||
stream: Box::pin(self.into_stream(start_key, ctx)),
|
||||
stream: Box::pin(self.get_stream_from(start_key, ctx)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a stream which yields all key, value pairs from the index
|
||||
/// starting from the first key greater or equal to `start_key`.
|
||||
///
|
||||
/// Note 1: that this is a copy of [`Self::visit`].
|
||||
/// Note that this is a copy of [`Self::visit`].
|
||||
/// TODO: Once the sequential read path is removed this will become
|
||||
/// the only index traversal method.
|
||||
///
|
||||
/// Note 2: this function used to take `&self` but it now consumes `self`. This is due to
|
||||
/// the lifetime constraints of the reader and the stream / iterator it creates. Using `&self`
|
||||
/// requires the reader to be present when the stream is used, and this creates a lifetime
|
||||
/// dependency between the reader and the stream. Now if we want to create an iterator that
|
||||
/// holds the stream, someone will need to keep a reference to the reader, which is inconvenient
|
||||
/// to use from the image/delta layer APIs.
|
||||
///
|
||||
/// Feel free to add the `&self` variant back if it's necessary.
|
||||
pub fn into_stream<'a>(
|
||||
self,
|
||||
pub fn get_stream_from<'a>(
|
||||
&'a self,
|
||||
start_key: &'a [u8; L],
|
||||
ctx: &'a RequestContext,
|
||||
) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a
|
||||
where
|
||||
R: 'a,
|
||||
{
|
||||
ctx: &'a mut RequestContext,
|
||||
) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a {
|
||||
try_stream! {
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
@@ -375,7 +363,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
mut visitor: V,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
|
||||
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
|
||||
mod zero_padded_read_write;
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -29,7 +28,7 @@ impl EphemeralFile {
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
|
||||
let filename_disambiguator =
|
||||
@@ -54,7 +53,7 @@ impl EphemeralFile {
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
|
||||
rw: page_caching::RW::new(file),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -66,15 +65,10 @@ impl EphemeralFile {
|
||||
self.rw.page_cache_file_id()
|
||||
}
|
||||
|
||||
/// See [`self::page_caching::RW::load_to_vec`].
|
||||
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||
self.rw.load_to_vec(ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
self.rw.read_blk(blknum, ctx).await
|
||||
}
|
||||
@@ -82,7 +76,7 @@ impl EphemeralFile {
|
||||
pub(crate) async fn write_blob(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ use crate::virtual_file::VirtualFile;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::{Deref, Range};
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tracing::*;
|
||||
|
||||
@@ -20,23 +19,14 @@ pub struct RW {
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
|
||||
/// should we pre-warm the [`crate::page_cache`] with the contents?
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PrewarmOnWrite {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
|
||||
pub fn new(file: VirtualFile) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
|
||||
page_cache_file_id,
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -48,7 +38,7 @@ impl RW {
|
||||
pub(crate) async fn write_all_borrowed(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<usize, io::Error> {
|
||||
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
|
||||
// because Compute is unlikely to access recently written data.
|
||||
@@ -59,47 +49,10 @@ impl RW {
|
||||
self.rw.bytes_written()
|
||||
}
|
||||
|
||||
/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
|
||||
///
|
||||
/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
|
||||
/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
|
||||
pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||
// round up to the next PAGE_SZ multiple, required by blob_io
|
||||
let size = {
|
||||
let s = usize::try_from(self.bytes_written()).unwrap();
|
||||
if s % PAGE_SZ == 0 {
|
||||
s
|
||||
} else {
|
||||
s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
|
||||
}
|
||||
};
|
||||
let vec = Vec::with_capacity(size);
|
||||
|
||||
// read from disk what we've already flushed
|
||||
let writer = self.rw.as_writer();
|
||||
let flushed_range = writer.written_range();
|
||||
let mut vec = writer
|
||||
.file
|
||||
.read_exact_at(
|
||||
vec.slice(0..(flushed_range.end - flushed_range.start)),
|
||||
u64::try_from(flushed_range.start).unwrap(),
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
|
||||
let buffered = self.rw.get_tail_zero_padded();
|
||||
vec.extend_from_slice(buffered);
|
||||
assert_eq!(vec.len(), size);
|
||||
assert_eq!(vec.len() % PAGE_SZ, 0);
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
match self.rw.read_blk(blknum).await? {
|
||||
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
|
||||
@@ -163,40 +116,19 @@ impl Drop for RW {
|
||||
}
|
||||
|
||||
struct PreWarmingWriter {
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
nwritten_blocks: u32,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
impl PreWarmingWriter {
|
||||
fn new(
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
) -> Self {
|
||||
fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
|
||||
Self {
|
||||
prewarm_on_write,
|
||||
nwritten_blocks: 0,
|
||||
page_cache_file_id,
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the byte range within `file` that has been written though `write_all`.
|
||||
///
|
||||
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
|
||||
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
|
||||
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
|
||||
struct Wrapper(Range<usize>);
|
||||
impl Deref for Wrapper {
|
||||
type Target = Range<usize>;
|
||||
fn deref(&self) -> &Range<usize> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
Wrapper(0..nwritten_blocks * PAGE_SZ)
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
|
||||
@@ -206,7 +138,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
|
||||
>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let buf = buf.slice(..);
|
||||
let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
|
||||
@@ -246,51 +178,45 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
|
||||
assert_eq!(&check_bounds_stuff_works, &*buf);
|
||||
}
|
||||
|
||||
// Pre-warm page cache with the contents.
|
||||
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
|
||||
// benefits the code that writes InMemoryLayer=>L0 layers.
|
||||
let nblocks = buflen / PAGE_SZ;
|
||||
let nblocks32 = u32::try_from(nblocks).unwrap();
|
||||
|
||||
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
|
||||
// Pre-warm page cache with the contents.
|
||||
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
|
||||
// benefits the code that writes InMemoryLayer=>L0 layers.
|
||||
|
||||
let cache = page_cache::get();
|
||||
static CTX: Lazy<RequestContext> = Lazy::new(|| {
|
||||
RequestContext::new(
|
||||
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
|
||||
crate::context::DownloadBehavior::Error,
|
||||
)
|
||||
});
|
||||
for blknum_in_buffer in 0..nblocks {
|
||||
let blk_in_buffer =
|
||||
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
|
||||
let blknum = self
|
||||
.nwritten_blocks
|
||||
.checked_add(blknum_in_buffer as u32)
|
||||
.unwrap();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
|
||||
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
|
||||
}
|
||||
Ok(v) => match v {
|
||||
page_cache::ReadBufResult::Found(_guard) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
|
||||
and this function takes &mut self, so, no concurrent read_blk is possible");
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(blk_in_buffer);
|
||||
let _ = write_guard.mark_valid();
|
||||
}
|
||||
},
|
||||
let cache = page_cache::get();
|
||||
static CTX: Lazy<RequestContext> = Lazy::new(|| {
|
||||
RequestContext::new(
|
||||
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
|
||||
crate::context::DownloadBehavior::Error,
|
||||
)
|
||||
});
|
||||
for blknum_in_buffer in 0..nblocks {
|
||||
let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
|
||||
let blknum = self
|
||||
.nwritten_blocks
|
||||
.checked_add(blknum_in_buffer as u32)
|
||||
.unwrap();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
|
||||
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
|
||||
}
|
||||
Ok(v) => match v {
|
||||
page_cache::ReadBufResult::Found(_guard) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
|
||||
and this function takes &mut self, so, no concurrent read_blk is possible");
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(blk_in_buffer);
|
||||
let _ = write_guard.mark_valid();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
|
||||
Ok((buflen, buf.into_inner()))
|
||||
}
|
||||
|
||||
@@ -64,7 +64,7 @@ where
|
||||
pub async fn write_all_borrowed(
|
||||
&mut self,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> std::io::Result<usize> {
|
||||
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
|
||||
}
|
||||
@@ -75,21 +75,6 @@ where
|
||||
flushed_offset + u64::try_from(buffer.pending()).unwrap()
|
||||
}
|
||||
|
||||
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
|
||||
pub fn get_tail_zero_padded(&self) -> &[u8] {
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
let buffer_written_up_to = buffer.pending();
|
||||
// pad to next page boundary
|
||||
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
|
||||
buffer_written_up_to
|
||||
} else {
|
||||
buffer_written_up_to
|
||||
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
|
||||
.unwrap()
|
||||
};
|
||||
&buffer.as_zero_padded_slice()[0..read_up_to]
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
|
||||
@@ -850,7 +850,7 @@ impl LayerMap {
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &mut RequestContext) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
|
||||
println!("open_layer:");
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -518,8 +518,8 @@ impl RemoteTimelineClient {
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = {
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Layer,
|
||||
|
||||
@@ -52,7 +52,7 @@ pub async fn download_layer_file<'a>(
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -78,8 +78,33 @@ pub async fn download_layer_file<'a>(
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
struct DownloadObjectClosure<'a> {
|
||||
storage: &'a GenericRemoteStorage,
|
||||
remote_path: &'a RemotePath,
|
||||
temp_file_path: &'a Utf8PathBuf,
|
||||
cancel: &'a CancellationToken,
|
||||
ctx: &'a mut RequestContext,
|
||||
}
|
||||
impl backoff::Op<u64, DownloadError> for DownloadObjectClosure<'_> {
|
||||
async fn call(&mut self) -> Result<u64, DownloadError> {
|
||||
let DownloadObjectClosure {
|
||||
storage,
|
||||
remote_path,
|
||||
temp_file_path,
|
||||
cancel,
|
||||
ctx,
|
||||
} = self;
|
||||
download_object(storage, remote_path, temp_file_path, cancel, ctx).await
|
||||
}
|
||||
}
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
|
||||
DownloadObjectClosure {
|
||||
storage,
|
||||
remote_path: &remote_path,
|
||||
temp_file_path: &temp_file_path,
|
||||
cancel,
|
||||
ctx,
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
@@ -107,9 +132,9 @@ pub async fn download_layer_file<'a>(
|
||||
// the in-memory state of the filesystem already has the layer file in its final place,
|
||||
// and subsequent pageserver code could think it's durable while it really isn't.
|
||||
let work = {
|
||||
let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
|
||||
let mut ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
|
||||
async move {
|
||||
let timeline_dir = VirtualFile::open(&timeline_path, &ctx)
|
||||
let timeline_dir = VirtualFile::open(&timeline_path, &mut ctx)
|
||||
.await
|
||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
||||
timeline_dir
|
||||
@@ -140,7 +165,7 @@ async fn download_object<'a>(
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
cancel: &CancellationToken,
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &mut RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let res = match crate::virtual_file::io_engine::get() {
|
||||
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
|
||||
@@ -568,15 +593,11 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
/// with backoff.
|
||||
///
|
||||
/// (See similar logic for uploads in `perform_upload_task`)
|
||||
pub(super) async fn download_retry<T, O, F>(
|
||||
op: O,
|
||||
pub(super) async fn download_retry<T>(
|
||||
op: impl backoff::Op<T, DownloadError>,
|
||||
description: &str,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
) -> Result<T, DownloadError> {
|
||||
backoff::retry(
|
||||
op,
|
||||
DownloadError::is_permanent,
|
||||
|
||||
@@ -23,8 +23,6 @@ use super::{
|
||||
storage_layer::LayerName,
|
||||
};
|
||||
|
||||
use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
|
||||
use metrics::UIntGauge;
|
||||
use pageserver_api::{
|
||||
models,
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -101,17 +99,6 @@ pub(crate) struct SecondaryTenant {
|
||||
|
||||
// Public state indicating overall progress of downloads relative to the last heatmap seen
|
||||
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
|
||||
|
||||
// Sum of layer sizes on local disk
|
||||
pub(super) resident_size_metric: UIntGauge,
|
||||
}
|
||||
|
||||
impl Drop for SecondaryTenant {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
|
||||
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
}
|
||||
}
|
||||
|
||||
impl SecondaryTenant {
|
||||
@@ -121,12 +108,6 @@ impl SecondaryTenant {
|
||||
tenant_conf: TenantConfOpt,
|
||||
config: &SecondaryLocationConfig,
|
||||
) -> Arc<Self> {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
Arc::new(Self {
|
||||
tenant_shard_id,
|
||||
// todo: shall we make this a descendent of the
|
||||
@@ -142,8 +123,6 @@ impl SecondaryTenant {
|
||||
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
||||
|
||||
progress: std::sync::Mutex::default(),
|
||||
|
||||
resident_size_metric,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -232,12 +211,16 @@ impl SecondaryTenant {
|
||||
// have to 100% match what is on disk, because it's a best-effort warming
|
||||
// of the cache.
|
||||
let mut detail = this.detail.lock().unwrap();
|
||||
if let Some(removed) =
|
||||
detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
|
||||
{
|
||||
// We might race with removal of the same layer during downloads, so finding the layer we
|
||||
// were trying to remove is optional. Only issue the disk I/O to remove it if we found it.
|
||||
removed.remove_blocking();
|
||||
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
|
||||
let removed = timeline_detail.on_disk_layers.remove(&name);
|
||||
|
||||
// We might race with removal of the same layer during downloads, if it was removed
|
||||
// from the heatmap. If we see that the OnDiskState is gone, then no need to
|
||||
// do a physical deletion or store in evicted_at.
|
||||
if let Some(removed) = removed {
|
||||
removed.remove_blocking();
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
|
||||
@@ -46,7 +46,6 @@ use crate::tenant::{
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use metrics::UIntGauge;
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
@@ -132,66 +131,16 @@ impl OnDiskState {
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err("Deleting secondary layer")
|
||||
}
|
||||
|
||||
pub(crate) fn file_size(&self) -> u64 {
|
||||
self.metadata.file_size
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(super) struct SecondaryDetailTimeline {
|
||||
on_disk_layers: HashMap<LayerName, OnDiskState>,
|
||||
pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
|
||||
|
||||
/// We remember when layers were evicted, to prevent re-downloading them.
|
||||
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
|
||||
}
|
||||
|
||||
impl SecondaryDetailTimeline {
|
||||
pub(super) fn remove_layer(
|
||||
&mut self,
|
||||
name: &LayerName,
|
||||
resident_metric: &UIntGauge,
|
||||
) -> Option<OnDiskState> {
|
||||
let removed = self.on_disk_layers.remove(name);
|
||||
if let Some(removed) = &removed {
|
||||
resident_metric.sub(removed.file_size());
|
||||
}
|
||||
removed
|
||||
}
|
||||
|
||||
/// `local_path`
|
||||
fn touch_layer<F>(
|
||||
&mut self,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
touched: &HeatMapLayer,
|
||||
resident_metric: &UIntGauge,
|
||||
local_path: F,
|
||||
) where
|
||||
F: FnOnce() -> Utf8PathBuf,
|
||||
{
|
||||
use std::collections::hash_map::Entry;
|
||||
match self.on_disk_layers.entry(touched.name.clone()) {
|
||||
Entry::Occupied(mut v) => {
|
||||
v.get_mut().access_time = touched.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
e.insert(OnDiskState::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
touched.name.clone(),
|
||||
touched.metadata.clone(),
|
||||
touched.access_time,
|
||||
local_path(),
|
||||
));
|
||||
resident_metric.add(touched.metadata.file_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Aspects of a heatmap that we remember after downloading it
|
||||
#[derive(Clone, Debug)]
|
||||
struct DownloadSummary {
|
||||
@@ -209,7 +158,7 @@ pub(super) struct SecondaryDetail {
|
||||
|
||||
last_download: Option<DownloadSummary>,
|
||||
next_download: Option<Instant>,
|
||||
timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
|
||||
/// Helper for logging SystemTime
|
||||
@@ -242,38 +191,6 @@ impl SecondaryDetail {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn evict_layer(
|
||||
&mut self,
|
||||
name: LayerName,
|
||||
timeline_id: &TimelineId,
|
||||
now: SystemTime,
|
||||
resident_metric: &UIntGauge,
|
||||
) -> Option<OnDiskState> {
|
||||
let timeline = self.timelines.get_mut(timeline_id)?;
|
||||
let removed = timeline.remove_layer(&name, resident_metric);
|
||||
if removed.is_some() {
|
||||
timeline.evicted_at.insert(name, now);
|
||||
}
|
||||
removed
|
||||
}
|
||||
|
||||
pub(super) fn remove_timeline(
|
||||
&mut self,
|
||||
timeline_id: &TimelineId,
|
||||
resident_metric: &UIntGauge,
|
||||
) {
|
||||
let removed = self.timelines.remove(timeline_id);
|
||||
if let Some(removed) = removed {
|
||||
resident_metric.sub(
|
||||
removed
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size)
|
||||
.sum(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Additionally returns the total number of layers, used for more stable relative access time
|
||||
/// based eviction.
|
||||
pub(super) fn get_layers_for_eviction(
|
||||
@@ -345,7 +262,6 @@ impl scheduler::RunningJob for RunningDownload {
|
||||
struct CompleteDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
completed_at: Instant,
|
||||
result: Result<(), UpdateError>,
|
||||
}
|
||||
|
||||
impl scheduler::Completion for CompleteDownload {
|
||||
@@ -370,33 +286,21 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
let CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: _completed_at,
|
||||
result,
|
||||
} = completion;
|
||||
|
||||
tracing::debug!("Secondary tenant download completed");
|
||||
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
|
||||
match result {
|
||||
Err(UpdateError::Restart) => {
|
||||
// Start downloading again as soon as we can. This will involve waiting for the scheduler's
|
||||
// scheduling interval. This slightly reduces the peak download speed of tenants that hit their
|
||||
// deadline and keep restarting, but that also helps give other tenants a chance to execute rather
|
||||
// that letting one big tenant dominate for a long time.
|
||||
detail.next_download = Some(Instant::now());
|
||||
}
|
||||
_ => {
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
let period = detail
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
}
|
||||
}
|
||||
// We advance next_download irrespective of errors: we don't want error cases to result in
|
||||
// expensive busy-polling.
|
||||
detail.next_download = Some(Instant::now() + period_jitter(period, 5));
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
@@ -492,10 +396,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
(RunningDownload { barrier }, Box::pin(async move {
|
||||
let _completion = completion;
|
||||
|
||||
let result = TenantDownloader::new(conf, &remote_storage, &secondary_state)
|
||||
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
|
||||
.download(&download_ctx)
|
||||
.await;
|
||||
match &result
|
||||
.await
|
||||
{
|
||||
Err(UpdateError::NoData) => {
|
||||
tracing::info!("No heatmap found for tenant. This is fine if it is new.");
|
||||
@@ -512,9 +415,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
|
||||
tracing::error!("Error while downloading tenant: {e}");
|
||||
},
|
||||
Err(UpdateError::Restart) => {
|
||||
tracing::info!("Download reached deadline & will restart to update heatmap")
|
||||
}
|
||||
Ok(()) => {}
|
||||
};
|
||||
|
||||
@@ -536,7 +436,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: Instant::now(),
|
||||
result
|
||||
}
|
||||
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||
}
|
||||
@@ -553,11 +452,6 @@ struct TenantDownloader<'a> {
|
||||
/// Errors that may be encountered while updating a tenant
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum UpdateError {
|
||||
/// This is not a true failure, but it's how a download indicates that it would like to be restarted by
|
||||
/// the scheduler, to pick up the latest heatmap
|
||||
#[error("Reached deadline, restarting downloads")]
|
||||
Restart,
|
||||
|
||||
#[error("No remote data found")]
|
||||
NoData,
|
||||
#[error("Insufficient local storage space")]
|
||||
@@ -612,7 +506,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> {
|
||||
async fn download(&self, ctx: &mut RequestContext) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
// For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
|
||||
@@ -684,13 +578,8 @@ impl<'a> TenantDownloader<'a> {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state = init_timeline_state(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
timeline,
|
||||
&self.secondary_state.resident_size_metric,
|
||||
)
|
||||
.await;
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
@@ -714,26 +603,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
|
||||
}
|
||||
|
||||
// Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again,
|
||||
// so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might
|
||||
// spend 10s of minutes downloading layers we don't need.
|
||||
// (see https://github.com/neondatabase/neon/issues/8182)
|
||||
let deadline = {
|
||||
let period = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_download
|
||||
.as_ref()
|
||||
.map(|d| d.upload_period)
|
||||
.unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
|
||||
|
||||
// Use double the period: we are not promising to complete within the period, this is just a heuristic
|
||||
// to keep using a "reasonably fresh" heatmap.
|
||||
Instant::now() + period * 2
|
||||
};
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
let timeline_state = timeline_states
|
||||
@@ -749,7 +618,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline, timeline_state, deadline, ctx)
|
||||
self.download_timeline(timeline, timeline_state, ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
@@ -759,25 +628,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Metrics consistency check in testing builds
|
||||
if cfg!(feature = "testing") {
|
||||
let detail = self.secondary_state.detail.lock().unwrap();
|
||||
let resident_size = detail
|
||||
.timelines
|
||||
.values()
|
||||
.map(|tl| {
|
||||
tl.on_disk_layers
|
||||
.values()
|
||||
.map(|v| v.metadata.file_size)
|
||||
.sum::<u64>()
|
||||
})
|
||||
.sum::<u64>();
|
||||
assert_eq!(
|
||||
resident_size,
|
||||
self.secondary_state.resident_size_metric.get()
|
||||
);
|
||||
}
|
||||
|
||||
// Only update last_etag after a full successful download: this way will not skip
|
||||
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
|
||||
@@ -890,7 +740,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
for delete_timeline in &delete_timelines {
|
||||
// We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
|
||||
// from disk fails that will be a fatal error.
|
||||
detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
|
||||
detail.timelines.remove(delete_timeline);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -908,7 +758,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
|
||||
continue;
|
||||
};
|
||||
timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
|
||||
timeline_state.on_disk_layers.remove(&layer_name);
|
||||
}
|
||||
|
||||
for timeline_id in delete_timelines {
|
||||
@@ -977,28 +827,26 @@ impl<'a> TenantDownloader<'a> {
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
/// Download heatmap layers that are not present on local disk, or update their
|
||||
/// access time if they are already present.
|
||||
async fn download_timeline_layers(
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
deadline: Instant,
|
||||
ctx: &RequestContext,
|
||||
) -> (Result<(), UpdateError>, Vec<HeatMapLayer>) {
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return (Err(UpdateError::Cancelled), touched);
|
||||
}
|
||||
|
||||
if Instant::now() > deadline {
|
||||
// We've been running downloads for a while, restart to download latest heatmap.
|
||||
return (Err(UpdateError::Restart), touched);
|
||||
return Err(UpdateError::Cancelled);
|
||||
}
|
||||
|
||||
// Existing on-disk layers: just update their access time.
|
||||
@@ -1068,66 +916,52 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.await
|
||||
.await?
|
||||
{
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
Ok(None) => {
|
||||
Some(layer) => touched.push(layer),
|
||||
None => {
|
||||
// Not an error but we didn't download it: remote layer is missing. Don't add it to the list of
|
||||
// things to consider touched.
|
||||
}
|
||||
Err(e) => {
|
||||
return (Err(e), touched);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(Ok(()), touched)
|
||||
}
|
||||
|
||||
async fn download_timeline(
|
||||
&self,
|
||||
timeline: HeatMapTimeline,
|
||||
timeline_state: SecondaryDetailTimeline,
|
||||
deadline: Instant,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
let (result, touched) = self
|
||||
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
|
||||
.await;
|
||||
|
||||
// Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful
|
||||
// Write updates to state to record layers we just downloaded or touched.
|
||||
{
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
let timeline_detail = detail.timelines.entry(timeline_id).or_default();
|
||||
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
|
||||
|
||||
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
|
||||
touched.into_iter().for_each(|t| {
|
||||
timeline_detail.touch_layer(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline_id,
|
||||
&t,
|
||||
&self.secondary_state.resident_size_metric,
|
||||
|| {
|
||||
local_layer_path(
|
||||
|
||||
for t in touched {
|
||||
use std::collections::hash_map::Entry;
|
||||
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
|
||||
Entry::Occupied(mut v) => {
|
||||
v.get_mut().access_time = t.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline_id,
|
||||
&timeline.timeline_id,
|
||||
&t.name,
|
||||
&t.metadata.generation,
|
||||
)
|
||||
},
|
||||
)
|
||||
});
|
||||
);
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
t.name,
|
||||
t.metadata.clone(),
|
||||
t.access_time,
|
||||
local_path,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
|
||||
@@ -1144,7 +978,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
layer: HeatMapLayer,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Option<HeatMapLayer>, UpdateError> {
|
||||
// Failpoint for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
@@ -1233,7 +1067,6 @@ async fn init_timeline_state(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
heatmap: &HeatMapTimeline,
|
||||
resident_metric: &UIntGauge,
|
||||
) -> SecondaryDetailTimeline {
|
||||
let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
|
||||
let mut detail = SecondaryDetailTimeline::default();
|
||||
@@ -1309,13 +1142,17 @@ async fn init_timeline_state(
|
||||
} else {
|
||||
// We expect the access time to be initialized immediately afterwards, when
|
||||
// the latest heatmap is applied to the state.
|
||||
detail.touch_layer(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&heatmap.timeline_id,
|
||||
remote_meta,
|
||||
resident_metric,
|
||||
|| file_path,
|
||||
detail.on_disk_layers.insert(
|
||||
name.clone(),
|
||||
OnDiskState::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&heatmap.timeline_id,
|
||||
name,
|
||||
remote_meta.metadata.clone(),
|
||||
remote_meta.access_time,
|
||||
file_path,
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -367,9 +367,10 @@ async fn upload_tenant_heatmap(
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let generation = tenant.get_generation();
|
||||
debug_assert!(!generation.is_none());
|
||||
if generation.is_none() {
|
||||
// We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants
|
||||
// We do not expect this: generations were implemented before heatmap uploads. However,
|
||||
// handle it so that we don't have to make the generation in the heatmap an Option<>
|
||||
// (Generation::none is not serializable)
|
||||
tracing::warn!("Skipping heatmap upload for tenant with generation==None");
|
||||
return Ok(UploadHeatmapOutcome::Skipped);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use tenant_size_model::svg::SvgBranchKind;
|
||||
use tokio::sync::oneshot::error::RecvError;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -88,9 +87,6 @@ impl SegmentMeta {
|
||||
LsnKind::BranchPoint => true,
|
||||
LsnKind::GcCutOff => true,
|
||||
LsnKind::BranchEnd => false,
|
||||
LsnKind::LeasePoint => true,
|
||||
LsnKind::LeaseStart => false,
|
||||
LsnKind::LeaseEnd => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -107,21 +103,6 @@ pub enum LsnKind {
|
||||
GcCutOff,
|
||||
/// Last record LSN
|
||||
BranchEnd,
|
||||
/// A LSN lease is granted here.
|
||||
LeasePoint,
|
||||
/// A lease starts from here.
|
||||
LeaseStart,
|
||||
/// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]).
|
||||
LeaseEnd,
|
||||
}
|
||||
|
||||
impl From<LsnKind> for SvgBranchKind {
|
||||
fn from(kind: LsnKind) -> Self {
|
||||
match kind {
|
||||
LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease,
|
||||
_ => SvgBranchKind::Timeline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
|
||||
@@ -143,9 +124,6 @@ pub struct TimelineInputs {
|
||||
|
||||
/// Cutoff point calculated from the user-supplied 'max_retention_period'
|
||||
retention_param_cutoff: Option<Lsn>,
|
||||
|
||||
/// Lease points on the timeline
|
||||
lease_points: Vec<Lsn>,
|
||||
}
|
||||
|
||||
/// Gathers the inputs for the tenant sizing model.
|
||||
@@ -170,7 +148,7 @@ pub(super) async fn gather_inputs(
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
cause: LogicalSizeCalculationCause,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
|
||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||
tenant.refresh_gc_info(cancel, ctx).await?;
|
||||
@@ -256,13 +234,6 @@ pub(super) async fn gather_inputs(
|
||||
None
|
||||
};
|
||||
|
||||
let lease_points = gc_info
|
||||
.leases
|
||||
.keys()
|
||||
.filter(|&&lsn| lsn > ancestor_lsn)
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// want to query any logical size before initdb_lsn.
|
||||
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
|
||||
@@ -277,8 +248,6 @@ pub(super) async fn gather_inputs(
|
||||
.map(|lsn| (lsn, LsnKind::BranchPoint))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
|
||||
|
||||
drop(gc_info);
|
||||
|
||||
// Add branch points we collected earlier, just in case there were any that were
|
||||
@@ -327,7 +296,6 @@ pub(super) async fn gather_inputs(
|
||||
if kind == LsnKind::BranchPoint {
|
||||
branchpoint_segments.insert((timeline_id, lsn), segments.len());
|
||||
}
|
||||
|
||||
segments.push(SegmentMeta {
|
||||
segment: Segment {
|
||||
parent: Some(parent),
|
||||
@@ -338,45 +306,7 @@ pub(super) async fn gather_inputs(
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind,
|
||||
});
|
||||
|
||||
parent = segments.len() - 1;
|
||||
|
||||
if kind == LsnKind::LeasePoint {
|
||||
// Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data
|
||||
// (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
|
||||
// value. Without the other two segments, the calculation code would not count the leased LSN as a point
|
||||
// to be retained.
|
||||
// Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug.
|
||||
//
|
||||
// Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
|
||||
// branch points can be given a synthetic id so we can unite them.
|
||||
let mut lease_parent = parent;
|
||||
|
||||
// Start of a lease.
|
||||
segments.push(SegmentMeta {
|
||||
segment: Segment {
|
||||
parent: Some(lease_parent),
|
||||
lsn: lsn.0,
|
||||
size: None, // Filled in later, if necessary
|
||||
needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
|
||||
},
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind: LsnKind::LeaseStart,
|
||||
});
|
||||
lease_parent += 1;
|
||||
|
||||
// End of the lease.
|
||||
segments.push(SegmentMeta {
|
||||
segment: Segment {
|
||||
parent: Some(lease_parent),
|
||||
lsn: lsn.0,
|
||||
size: None, // Filled in later, if necessary
|
||||
needed: true, // everything at the lease LSN must be readable => is needed
|
||||
},
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind: LsnKind::LeaseEnd,
|
||||
});
|
||||
}
|
||||
parent += 1;
|
||||
}
|
||||
|
||||
// Current end of the timeline
|
||||
@@ -402,7 +332,6 @@ pub(super) async fn gather_inputs(
|
||||
pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
retention_param_cutoff,
|
||||
lease_points,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -450,7 +379,7 @@ async fn fill_logical_sizes(
|
||||
limit: &Arc<Semaphore>,
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
cause: LogicalSizeCalculationCause,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), CalculateSyntheticSizeError> {
|
||||
let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
|
||||
timelines
|
||||
@@ -745,8 +674,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"horizon_cutoff": "0/2210CD0",
|
||||
"pitr_cutoff": "0/2210CD0",
|
||||
"next_gc_cutoff": "0/2210CD0",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
"retention_param_cutoff": null
|
||||
},
|
||||
{
|
||||
"timeline_id": "454626700469f0a9914949b9d018e876",
|
||||
@@ -756,8 +684,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"horizon_cutoff": "0/1817770",
|
||||
"pitr_cutoff": "0/1817770",
|
||||
"next_gc_cutoff": "0/1817770",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
"retention_param_cutoff": null
|
||||
},
|
||||
{
|
||||
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
|
||||
@@ -767,8 +694,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"horizon_cutoff": "0/18B3D98",
|
||||
"pitr_cutoff": "0/18B3D98",
|
||||
"next_gc_cutoff": "0/18B3D98",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
"retention_param_cutoff": null
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -823,8 +749,7 @@ fn verify_size_for_one_branch() {
|
||||
"horizon_cutoff": "47/240A5860",
|
||||
"pitr_cutoff": "47/240A5860",
|
||||
"next_gc_cutoff": "47/240A5860",
|
||||
"retention_param_cutoff": "0/0",
|
||||
"lease_points": []
|
||||
"retention_param_cutoff": "0/0"
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
|
||||
@@ -425,7 +425,7 @@ impl ReadableLayer {
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
@@ -574,7 +574,7 @@ impl LayerAccessStats {
|
||||
});
|
||||
}
|
||||
|
||||
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
|
||||
fn record_access(&self, access_kind: LayerAccessKind, ctx: &mut RequestContext) {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -249,7 +249,7 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &mut RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
@@ -292,7 +292,7 @@ impl DeltaLayer {
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
// Quick exit if already loaded
|
||||
@@ -302,7 +302,7 @@ impl DeltaLayer {
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path()))
|
||||
}
|
||||
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
async fn load_inner(&self, ctx: &mut RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, None, ctx)
|
||||
@@ -393,7 +393,7 @@ impl DeltaLayerWriterInner {
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
// the end key yet, so we cannot form the final filename yet. We will
|
||||
@@ -435,7 +435,7 @@ impl DeltaLayerWriterInner {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let (_, res) = self
|
||||
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx)
|
||||
@@ -449,15 +449,10 @@ impl DeltaLayerWriterInner {
|
||||
lsn: Lsn,
|
||||
val: Vec<u8>,
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
// We don't want to use compression in delta layer creation
|
||||
let compression = ImageCompressionAlgorithm::DisabledNoDecompress;
|
||||
let (val, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(val, ctx, compression)
|
||||
.await;
|
||||
let (val, res) = self.blob_writer.write_blob(val, ctx).await;
|
||||
let off = match res {
|
||||
Ok(off) => off,
|
||||
Err(e) => return (val, Err(anyhow::anyhow!(e))),
|
||||
@@ -481,7 +476,7 @@ impl DeltaLayerWriterInner {
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let temp_path = self.path.clone();
|
||||
let result = self.finish0(key_end, timeline, ctx).await;
|
||||
@@ -498,7 +493,7 @@ impl DeltaLayerWriterInner {
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
@@ -608,7 +603,7 @@ impl DeltaLayerWriter {
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Some(
|
||||
@@ -635,7 +630,7 @@ impl DeltaLayerWriter {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
@@ -650,7 +645,7 @@ impl DeltaLayerWriter {
|
||||
lsn: Lsn,
|
||||
val: Vec<u8>,
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
self.inner
|
||||
.as_mut()
|
||||
@@ -670,7 +665,7 @@ impl DeltaLayerWriter {
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
@@ -709,7 +704,7 @@ impl DeltaLayer {
|
||||
pub async fn rewrite_summary<F>(
|
||||
path: &Utf8Path,
|
||||
rewrite: F,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), RewriteSummaryError>
|
||||
where
|
||||
F: Fn(Summary) -> Summary,
|
||||
@@ -749,7 +744,7 @@ impl DeltaLayerInner {
|
||||
path: &Utf8Path,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
Ok(file) => file,
|
||||
@@ -798,7 +793,7 @@ impl DeltaLayerInner {
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
@@ -829,13 +824,13 @@ impl DeltaLayerInner {
|
||||
|
||||
!blob_ref.will_init()
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
&mut RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
let ctx = &mut RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
@@ -894,7 +889,7 @@ impl DeltaLayerInner {
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -933,9 +928,10 @@ impl DeltaLayerInner {
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -945,7 +941,7 @@ impl DeltaLayerInner {
|
||||
);
|
||||
let mut result = Vec::new();
|
||||
let mut stream =
|
||||
Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
@@ -977,12 +973,12 @@ impl DeltaLayerInner {
|
||||
index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
|
||||
mut planner: VectoredReadPlanner,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>>
|
||||
where
|
||||
Reader: BlockReader + Clone,
|
||||
Reader: BlockReader,
|
||||
{
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let mut ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build();
|
||||
|
||||
@@ -990,7 +986,7 @@ impl DeltaLayerInner {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
|
||||
let index_stream = index_reader.clone().into_stream(&start_key.0, &ctx);
|
||||
let index_stream = index_reader.get_stream_from(&start_key.0, &mut ctx);
|
||||
let mut index_stream = std::pin::pin!(index_stream);
|
||||
|
||||
while let Some(index_entry) = index_stream.next().await {
|
||||
@@ -1066,7 +1062,7 @@ impl DeltaLayerInner {
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut ignore_key_with_err = None;
|
||||
@@ -1144,7 +1140,7 @@ impl DeltaLayerInner {
|
||||
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Vec<DeltaEntry<'a>>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -1183,7 +1179,7 @@ impl DeltaLayerInner {
|
||||
all_keys.push(entry);
|
||||
true
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
&mut RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
@@ -1203,7 +1199,7 @@ impl DeltaLayerInner {
|
||||
&self,
|
||||
writer: &mut DeltaLayerWriter,
|
||||
until: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
|
||||
@@ -1245,7 +1241,7 @@ impl DeltaLayerInner {
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let stream = self.stream_index_forwards(tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
|
||||
let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
|
||||
let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
|
||||
// put in a sentinel value for getting the end offset for last item, and not having to
|
||||
// repeat the whole read part
|
||||
@@ -1304,7 +1300,7 @@ impl DeltaLayerInner {
|
||||
offsets.start.pos(),
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
Some(max_read_size),
|
||||
max_read_size,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -1391,7 +1387,7 @@ impl DeltaLayerInner {
|
||||
Ok(records)
|
||||
}
|
||||
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub(super) async fn dump(&self, ctx: &mut RequestContext) -> anyhow::Result<()> {
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
self.index_start_blk, self.index_root_blk
|
||||
@@ -1408,7 +1404,7 @@ impl DeltaLayerInner {
|
||||
|
||||
let keys = self.load_keys(ctx).await?;
|
||||
|
||||
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
async fn dump_blob(val: &ValueRef<'_>, ctx: &mut RequestContext) -> anyhow::Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
@@ -1463,17 +1459,17 @@ impl DeltaLayerInner {
|
||||
|
||||
fn stream_index_forwards<'a, R>(
|
||||
&'a self,
|
||||
reader: DiskBtreeReader<R, DELTA_KEY_SIZE>,
|
||||
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
|
||||
start: &'a [u8; DELTA_KEY_SIZE],
|
||||
ctx: &'a RequestContext,
|
||||
) -> impl futures::stream::Stream<
|
||||
Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
|
||||
> + 'a
|
||||
where
|
||||
R: BlockReader + 'a,
|
||||
R: BlockReader,
|
||||
{
|
||||
use futures::stream::TryStreamExt;
|
||||
let stream = reader.into_stream(start, ctx);
|
||||
let stream = reader.get_stream_from(start, ctx);
|
||||
stream.map_ok(|(key, value)| {
|
||||
let key = DeltaKey::from_slice(&key);
|
||||
let (key, lsn) = (key.key(), key.lsn());
|
||||
@@ -1497,24 +1493,6 @@ impl DeltaLayerInner {
|
||||
);
|
||||
offset
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
DeltaLayerIterator {
|
||||
delta_layer: self,
|
||||
ctx,
|
||||
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
@@ -1535,7 +1513,7 @@ pub struct ValueRef<'a> {
|
||||
|
||||
impl<'a> ValueRef<'a> {
|
||||
/// Loads the value from disk
|
||||
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
|
||||
pub async fn load(&self, ctx: &mut RequestContext) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
@@ -1549,7 +1527,7 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id);
|
||||
block_reader.read_blk(blknum, ctx).await
|
||||
@@ -1574,70 +1552,6 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct DeltaLayerIterator<'a> {
|
||||
delta_layer: &'a DeltaLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
assert!(!self.is_end);
|
||||
|
||||
let plan = loop {
|
||||
if let Some(res) = self.index_iter.next().await {
|
||||
let (raw_key, value) = res?;
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
let offset = blob_ref.pos();
|
||||
if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
self.is_end = true;
|
||||
let data_end_offset = self.delta_layer.index_start_offset();
|
||||
break self.planner.handle_range_end(data_end_offset);
|
||||
}
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
|
||||
let mut next_batch = std::collections::VecDeque::new();
|
||||
let buf_size = plan.size();
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = Value::des(&frozen_buf[meta.start..meta.end])?;
|
||||
next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.key_values_batch.is_empty() {
|
||||
if self.is_end {
|
||||
return Ok(None);
|
||||
}
|
||||
self.next_batch().await?;
|
||||
}
|
||||
Ok(Some(
|
||||
self.key_values_batch
|
||||
.pop_front()
|
||||
.expect("should not be empty"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
@@ -1647,9 +1561,6 @@ mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
@@ -1946,7 +1857,7 @@ mod test {
|
||||
.finish(entries_meta.key_range.end, &timeline, &ctx)
|
||||
.await?;
|
||||
|
||||
let inner = resident.get_as_delta(&ctx).await?;
|
||||
let inner = resident.as_delta(&ctx).await?;
|
||||
|
||||
let file_size = inner.file.metadata().await?.len();
|
||||
tracing::info!(
|
||||
@@ -2133,11 +2044,11 @@ mod test {
|
||||
|
||||
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
|
||||
|
||||
copied_layer.get_as_delta(ctx).await.unwrap();
|
||||
copied_layer.as_delta(ctx).await.unwrap();
|
||||
|
||||
assert_keys_and_values_eq(
|
||||
new_layer.get_as_delta(ctx).await.unwrap(),
|
||||
copied_layer.get_as_delta(ctx).await.unwrap(),
|
||||
new_layer.as_delta(ctx).await.unwrap(),
|
||||
copied_layer.as_delta(ctx).await.unwrap(),
|
||||
truncate_at,
|
||||
ctx,
|
||||
)
|
||||
@@ -2149,7 +2060,7 @@ mod test {
|
||||
source: &DeltaLayerInner,
|
||||
truncated: &DeltaLayerInner,
|
||||
truncated_at: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) {
|
||||
use futures::future::ready;
|
||||
use futures::stream::TryStreamExt;
|
||||
@@ -2162,7 +2073,7 @@ mod test {
|
||||
source.index_root_blk,
|
||||
&source_reader,
|
||||
);
|
||||
let source_stream = source.stream_index_forwards(source_tree, &start_key, ctx);
|
||||
let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
|
||||
let source_stream = source_stream.filter(|res| match res {
|
||||
Ok((_, lsn, _)) => ready(lsn < &truncated_at),
|
||||
_ => ready(true),
|
||||
@@ -2175,7 +2086,7 @@ mod test {
|
||||
truncated.index_root_blk,
|
||||
&truncated_reader,
|
||||
);
|
||||
let truncated_stream = truncated.stream_index_forwards(truncated_tree, &start_key, ctx);
|
||||
let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
|
||||
let mut truncated_stream = std::pin::pin!(truncated_stream);
|
||||
|
||||
let mut scratch_left = Vec::new();
|
||||
@@ -2216,116 +2127,4 @@ mod test {
|
||||
assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
|
||||
}
|
||||
}
|
||||
|
||||
async fn produce_delta_layer(
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut deltas: Vec<(Key, Lsn, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
|
||||
let (key_start, _, _) = deltas.first().unwrap();
|
||||
let (key_max, _, _) = deltas.first().unwrap();
|
||||
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
let lsn_end = Lsn(lsn_max.0 + 1);
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
*key_start,
|
||||
(*lsn_min)..lsn_end,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let key_end = key_max.next();
|
||||
|
||||
for (key, lsn, value) in deltas {
|
||||
writer.put_value(key, lsn, value, ctx).await?;
|
||||
}
|
||||
let delta_layer = writer.finish(key_end, tline, ctx).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(delta_layer)
|
||||
}
|
||||
|
||||
async fn assert_delta_iter_equal(
|
||||
delta_iter: &mut DeltaLayerIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = delta_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
assert_eq!(o1.is_some(), o2.is_some());
|
||||
if o1.is_none() && o2.is_none() {
|
||||
break;
|
||||
}
|
||||
let (k1, l1, v1) = o1.unwrap();
|
||||
let (k2, l2, v2) = o2.unwrap();
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, *l2);
|
||||
assert_eq!(&v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_layer_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 1000;
|
||||
let test_deltas = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32 / 10),
|
||||
Lsn(0x10 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap();
|
||||
for max_read_size in [1, 1024] {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
let mut iter = delta_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut num_items = 0;
|
||||
for _ in 0..3 {
|
||||
iter.next_batch().await.unwrap();
|
||||
num_items += iter.key_values_batch.len();
|
||||
if max_read_size == 1 {
|
||||
// every key should be a batch b/c the value is larger than max_read_size
|
||||
assert_eq!(iter.key_values_batch.len(), 1);
|
||||
} else {
|
||||
assert_eq!(iter.key_values_batch.len(), batch_size);
|
||||
}
|
||||
if num_items >= N {
|
||||
break;
|
||||
}
|
||||
iter.key_values_batch.clear();
|
||||
}
|
||||
// Test if the result is correct
|
||||
let mut iter = delta_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
assert_delta_iter_equal(&mut iter, &test_deltas).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -165,7 +165,6 @@ pub struct ImageLayerInner {
|
||||
file_id: FileId,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
compressed_reads: bool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
@@ -178,9 +177,8 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
pub(super) async fn dump(&self, ctx: &mut RequestContext) -> anyhow::Result<()> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
@@ -219,7 +217,7 @@ impl AsLayerDesc for ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &mut RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
@@ -256,7 +254,7 @@ impl ImageLayer {
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
self.inner
|
||||
@@ -265,13 +263,12 @@ impl ImageLayer {
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path()))
|
||||
}
|
||||
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
async fn load_inner(&self, ctx: &mut RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded =
|
||||
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
// not production code
|
||||
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
|
||||
@@ -339,7 +336,7 @@ impl ImageLayer {
|
||||
pub async fn rewrite_summary<F>(
|
||||
path: &Utf8Path,
|
||||
rewrite: F,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), RewriteSummaryError>
|
||||
where
|
||||
F: Fn(Summary) -> Summary,
|
||||
@@ -380,8 +377,7 @@ impl ImageLayerInner {
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
support_compressed_reads: bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
Ok(file) => file,
|
||||
@@ -424,7 +420,6 @@ impl ImageLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
compressed_reads: support_compressed_reads,
|
||||
key_range: actual_summary.key_range,
|
||||
}))
|
||||
}
|
||||
@@ -433,10 +428,9 @@ impl ImageLayerInner {
|
||||
&self,
|
||||
key: Key,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
|
||||
@@ -445,7 +439,7 @@ impl ImageLayerInner {
|
||||
if let Some(offset) = tree_reader
|
||||
.get(
|
||||
&keybuf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
&mut RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
@@ -455,7 +449,7 @@ impl ImageLayerInner {
|
||||
.block_cursor()
|
||||
.read_blob(
|
||||
offset,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
&mut RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerValue)
|
||||
.build(),
|
||||
)
|
||||
@@ -476,7 +470,7 @@ impl ImageLayerInner {
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, None, ctx)
|
||||
@@ -492,18 +486,17 @@ impl ImageLayerInner {
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
let mut result = Vec::new();
|
||||
let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
while let Some(item) = stream.next().await {
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
@@ -529,7 +522,7 @@ impl ImageLayerInner {
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
shard_identity: Option<&ShardIdentity>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
self.max_vectored_read_bytes
|
||||
@@ -538,12 +531,11 @@ impl ImageLayerInner {
|
||||
.into(),
|
||||
);
|
||||
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let mut ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build();
|
||||
|
||||
@@ -552,7 +544,7 @@ impl ImageLayerInner {
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
let index_stream = tree_reader.clone().into_stream(&search_key, &ctx);
|
||||
let index_stream = tree_reader.get_stream_from(&search_key, &mut ctx);
|
||||
let mut index_stream = std::pin::pin!(index_stream);
|
||||
|
||||
while let Some(index_entry) = index_stream.next().await {
|
||||
@@ -595,7 +587,7 @@ impl ImageLayerInner {
|
||||
&self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
// Fragment the range into the regions owned by this ShardIdentity
|
||||
let plan = self
|
||||
@@ -637,7 +629,7 @@ impl ImageLayerInner {
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) {
|
||||
let max_vectored_read_bytes = self
|
||||
.max_vectored_read_bytes
|
||||
@@ -697,25 +689,6 @@ impl ImageLayerInner {
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
ImageLayerIterator {
|
||||
image_layer: self,
|
||||
ctx,
|
||||
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
@@ -751,7 +724,7 @@ impl ImageLayerWriterInner {
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
@@ -806,7 +779,7 @@ impl ImageLayerWriterInner {
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
|
||||
@@ -826,7 +799,7 @@ impl ImageLayerWriterInner {
|
||||
async fn finish(
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
@@ -926,7 +899,7 @@ impl ImageLayerWriter {
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(
|
||||
@@ -945,7 +918,7 @@ impl ImageLayerWriter {
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
@@ -956,7 +929,7 @@ impl ImageLayerWriter {
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(timeline, ctx).await
|
||||
}
|
||||
@@ -970,77 +943,11 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct ImageLayerIterator<'a> {
|
||||
image_layer: &'a ImageLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
assert!(!self.is_end);
|
||||
|
||||
let plan = loop {
|
||||
if let Some(res) = self.index_iter.next().await {
|
||||
let (raw_key, offset) = res?;
|
||||
if let Some(batch_plan) = self.planner.handle(
|
||||
Key::from_slice(&raw_key[..KEY_SIZE]),
|
||||
self.image_layer.lsn,
|
||||
offset,
|
||||
BlobFlag::None,
|
||||
) {
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
self.is_end = true;
|
||||
let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
break self.planner.handle_range_end(payload_end);
|
||||
}
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
|
||||
let mut next_batch = std::collections::VecDeque::new();
|
||||
let buf_size = plan.size();
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf: Bytes = blobs_buf.buf.freeze();
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.key_values_batch.is_empty() {
|
||||
if self.is_end {
|
||||
return Ok(None);
|
||||
}
|
||||
self.next_batch().await?;
|
||||
}
|
||||
Ok(Some(
|
||||
self.key_values_batch
|
||||
.pop_front()
|
||||
.expect("should not be empty"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
@@ -1052,19 +959,11 @@ mod test {
|
||||
};
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
repository::Value,
|
||||
tenant::{
|
||||
config::TenantConf,
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::ResidentLayer,
|
||||
vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
Tenant, Timeline,
|
||||
},
|
||||
tenant::{config::TenantConf, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use super::{ImageLayerIterator, ImageLayerWriter};
|
||||
use super::ImageLayerWriter;
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
@@ -1235,111 +1134,4 @@ mod test {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn produce_image_layer(
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut images: Vec<(Key, Bytes)>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
images.sort();
|
||||
let (key_start, _) = images.first().unwrap();
|
||||
let (key_last, _) = images.last().unwrap();
|
||||
let key_end = key_last.next();
|
||||
let key_range = *key_start..key_end;
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
&key_range,
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for (key, img) in images {
|
||||
writer.put_image(key, img, ctx).await?;
|
||||
}
|
||||
let img_layer = writer.finish(tline, ctx).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(img_layer)
|
||||
}
|
||||
|
||||
async fn assert_img_iter_equal(
|
||||
img_iter: &mut ImageLayerIterator<'_>,
|
||||
expect: &[(Key, Bytes)],
|
||||
expect_lsn: Lsn,
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = img_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
match (o1, o2) {
|
||||
(None, None) => break,
|
||||
(Some((k1, l1, v1)), Some((k2, i2))) => {
|
||||
let Value::Image(i1) = v1 else {
|
||||
panic!("expect Value::Image")
|
||||
};
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, expect_lsn);
|
||||
assert_eq!(&i1, i2);
|
||||
}
|
||||
(o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_iterator() {
|
||||
let harness = TenantHarness::create("image_layer_iterator").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 1000;
|
||||
let test_imgs = (0..N)
|
||||
.map(|idx| (get_key(idx as u32), Bytes::from(format!("img{idx:05}"))))
|
||||
.collect_vec();
|
||||
let resident_layer =
|
||||
produce_image_layer(&tenant, &tline, test_imgs.clone(), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
|
||||
for max_read_size in [1, 1024] {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
let mut iter = img_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut num_items = 0;
|
||||
for _ in 0..3 {
|
||||
iter.next_batch().await.unwrap();
|
||||
num_items += iter.key_values_batch.len();
|
||||
if max_read_size == 1 {
|
||||
// every key should be a batch b/c the value is larger than max_read_size
|
||||
assert_eq!(iter.key_values_batch.len(), 1);
|
||||
} else {
|
||||
assert_eq!(iter.key_values_batch.len(), batch_size);
|
||||
}
|
||||
if num_items >= N {
|
||||
break;
|
||||
}
|
||||
iter.key_values_batch.clear();
|
||||
}
|
||||
// Test if the result is correct
|
||||
let mut iter = img_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,14 +6,13 @@
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use crate::{page_cache, walrecord};
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
@@ -257,7 +256,7 @@ impl InMemoryLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completly unused
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &mut RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_str = self.end_lsn_or_max();
|
||||
@@ -309,12 +308,12 @@ impl InMemoryLayer {
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let mut ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
@@ -326,7 +325,7 @@ impl InMemoryLayer {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos, &ctx).await?;
|
||||
let buf = reader.read_blob(*pos, &mut ctx).await?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
@@ -366,9 +365,9 @@ impl InMemoryLayer {
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let mut ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
@@ -411,8 +410,7 @@ impl InMemoryLayer {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
|
||||
let buf = reader.read_blob(block_read.block_offset, &ctx).await;
|
||||
let buf = reader.read_blob(block_read.block_offset, &mut ctx).await;
|
||||
if let Err(e) = buf {
|
||||
reconstruct_state
|
||||
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
|
||||
@@ -475,7 +473,7 @@ impl InMemoryLayer {
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
@@ -514,7 +512,7 @@ impl InMemoryLayer {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
@@ -527,7 +525,7 @@ impl InMemoryLayer {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
@@ -536,7 +534,7 @@ impl InMemoryLayer {
|
||||
.file
|
||||
.write_blob(
|
||||
buf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
&mut RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
)
|
||||
@@ -608,7 +606,7 @@ impl InMemoryLayer {
|
||||
pub(crate) async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
key_range: Option<Range<Key>>,
|
||||
) -> Result<Option<ResidentLayer>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
@@ -622,25 +620,20 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match &*l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
let key_count = if let Some(key_range) = key_range {
|
||||
let keys: Vec<_> = if let Some(key_range) = key_range {
|
||||
inner
|
||||
.index
|
||||
.iter()
|
||||
.filter(|(k, _)| key_range.contains(k))
|
||||
.count()
|
||||
.map(|(k, m)| (k.to_i128(), m))
|
||||
.collect()
|
||||
} else {
|
||||
inner.index.len()
|
||||
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
|
||||
};
|
||||
if key_count == 0 {
|
||||
|
||||
if keys.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
@@ -654,77 +647,28 @@ impl InMemoryLayer {
|
||||
)
|
||||
.await?;
|
||||
|
||||
match &*l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
}
|
||||
}
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
assert_eq!(
|
||||
file_contents.len() % PAGE_SZ,
|
||||
0,
|
||||
"needed by BlockReaderRef::Slice"
|
||||
);
|
||||
assert_eq!(file_contents.len(), {
|
||||
let written = usize::try_from(inner.file.len()).unwrap();
|
||||
if written % PAGE_SZ == 0 {
|
||||
written
|
||||
} else {
|
||||
written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
|
||||
}
|
||||
});
|
||||
|
||||
let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
// TODO: once we have blob lengths in the in-memory index, we can
|
||||
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
|
||||
// 2. load the file contents into a Bytes and
|
||||
// 3. the use `Bytes::slice` to get the `buf` that is our blob
|
||||
// 4. pass that `buf` into `put_value_bytes`
|
||||
// => https://github.com/neondatabase/neon/issues/8183
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(*key, *lsn, buf, will_init, ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
}
|
||||
|
||||
// Hold the permit until the IO is done; if we didn't, one could drop this future,
|
||||
// thereby releasing the permit, but the Vec<u8> remains allocated until the IO completes.
|
||||
// => we'd have more concurrenct Vec<u8> than allowed as per the semaphore.
|
||||
drop(_concurrency_permit);
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
}
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,12 +93,16 @@ pub(crate) struct Layer(Arc<LayerInner>);
|
||||
|
||||
impl std::fmt::Display for Layer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}{}",
|
||||
self.layer_desc().short_id(),
|
||||
self.0.generation.get_suffix()
|
||||
)
|
||||
if matches!(self.0.generation, Generation::Broken) {
|
||||
write!(f, "{}-broken", self.layer_desc().short_id())
|
||||
} else {
|
||||
write!(
|
||||
f,
|
||||
"{}{}",
|
||||
self.layer_desc().short_id(),
|
||||
self.0.generation.get_suffix()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -327,7 +331,7 @@ impl Layer {
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use anyhow::ensure;
|
||||
|
||||
@@ -357,7 +361,7 @@ impl Layer {
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let layer = self
|
||||
.0
|
||||
@@ -385,9 +389,10 @@ impl Layer {
|
||||
}
|
||||
|
||||
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
|
||||
let layer = self
|
||||
.0
|
||||
@@ -474,7 +479,7 @@ impl Layer {
|
||||
|
||||
/// Traditional debug dumping facility
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &mut RequestContext) -> anyhow::Result<()> {
|
||||
self.0.desc.dump();
|
||||
|
||||
if verbose {
|
||||
@@ -893,7 +898,7 @@ impl LayerInner {
|
||||
async fn get_or_maybe_download(
|
||||
self: &Arc<Self>,
|
||||
allow_download: bool,
|
||||
ctx: Option<&RequestContext>,
|
||||
ctx: Option<&mut RequestContext>,
|
||||
) -> Result<Arc<DownloadedLayer>, DownloadError> {
|
||||
let (weak, permit) = {
|
||||
// get_or_init_detached can:
|
||||
@@ -983,7 +988,7 @@ impl LayerInner {
|
||||
return Err(DownloadError::NotFile(ft));
|
||||
}
|
||||
|
||||
if let Some(ctx) = ctx {
|
||||
if let Some(ref ctx) = ctx {
|
||||
self.check_expected_download(ctx)?;
|
||||
}
|
||||
|
||||
@@ -1044,7 +1049,7 @@ impl LayerInner {
|
||||
self: &Arc<Self>,
|
||||
timeline: Arc<Timeline>,
|
||||
permit: heavier_once_cell::InitPermit,
|
||||
ctx: RequestContext,
|
||||
mut ctx: RequestContext,
|
||||
) -> Result<Arc<DownloadedLayer>, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -1074,7 +1079,7 @@ impl LayerInner {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let res = this.download_and_init(timeline, permit, &ctx).await;
|
||||
let res = this.download_and_init(timeline, permit, &mut ctx).await;
|
||||
|
||||
if let Err(res) = tx.send(res) {
|
||||
match res {
|
||||
@@ -1096,10 +1101,19 @@ impl LayerInner {
|
||||
|
||||
match rx.await {
|
||||
Ok(Ok(res)) => Ok(res),
|
||||
Ok(Err(remote_storage::DownloadError::Cancelled)) => {
|
||||
Err(DownloadError::DownloadCancelled)
|
||||
Ok(Err(e)) => {
|
||||
// sleep already happened in the spawned task, if it was not cancelled
|
||||
match e.downcast_ref::<remote_storage::DownloadError>() {
|
||||
// If the download failed due to its cancellation token,
|
||||
// propagate the cancellation error upstream.
|
||||
Some(remote_storage::DownloadError::Cancelled) => {
|
||||
Err(DownloadError::DownloadCancelled)
|
||||
}
|
||||
// FIXME: this is not embedding the error because historically it would had
|
||||
// been output to compute, however that is no longer the case.
|
||||
_ => Err(DownloadError::DownloadFailed),
|
||||
}
|
||||
}
|
||||
Ok(Err(_)) => Err(DownloadError::DownloadFailed),
|
||||
Err(_gone) => Err(DownloadError::DownloadCancelled),
|
||||
}
|
||||
}
|
||||
@@ -1108,8 +1122,8 @@ impl LayerInner {
|
||||
self: &Arc<LayerInner>,
|
||||
timeline: Arc<Timeline>,
|
||||
permit: heavier_once_cell::InitPermit,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Arc<DownloadedLayer>> {
|
||||
let result = timeline
|
||||
.remote_client
|
||||
.download_layer_file(
|
||||
@@ -1648,9 +1662,9 @@ impl DownloadedLayer {
|
||||
async fn get<'a>(
|
||||
&'a self,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<&'a LayerKind> {
|
||||
let init = || async {
|
||||
let init = async {
|
||||
assert_eq!(
|
||||
Weak::as_ptr(&self.owner),
|
||||
Arc::as_ptr(owner),
|
||||
@@ -1685,7 +1699,6 @@ impl DownloadedLayer {
|
||||
lsn,
|
||||
summary,
|
||||
Some(owner.conf.max_vectored_read_bytes),
|
||||
owner.conf.image_compression.allow_decompression(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1706,7 +1719,7 @@ impl DownloadedLayer {
|
||||
}
|
||||
};
|
||||
self.kind
|
||||
.get_or_try_init(init)
|
||||
.get_or_try_init(move || init)
|
||||
// return transient errors using `?`
|
||||
.await?
|
||||
.as_ref()
|
||||
@@ -1723,7 +1736,7 @@ impl DownloadedLayer {
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use LayerKind::*;
|
||||
|
||||
@@ -1745,7 +1758,7 @@ impl DownloadedLayer {
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
use LayerKind::*;
|
||||
|
||||
@@ -1761,10 +1774,11 @@ impl DownloadedLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn load_key_values(
|
||||
&self,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
|
||||
use LayerKind::*;
|
||||
|
||||
@@ -1774,7 +1788,7 @@ impl DownloadedLayer {
|
||||
}
|
||||
}
|
||||
|
||||
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &mut RequestContext) -> anyhow::Result<()> {
|
||||
use LayerKind::*;
|
||||
match self.get(owner, ctx).await? {
|
||||
Delta(d) => d.dump(ctx).await?,
|
||||
@@ -1823,7 +1837,7 @@ impl ResidentLayer {
|
||||
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
|
||||
pub(crate) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
|
||||
use LayerKind::*;
|
||||
|
||||
@@ -1852,7 +1866,7 @@ impl ResidentLayer {
|
||||
&'a self,
|
||||
shard_identity: &ShardIdentity,
|
||||
writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use LayerKind::*;
|
||||
|
||||
@@ -1867,7 +1881,7 @@ impl ResidentLayer {
|
||||
&self,
|
||||
writer: &mut super::delta_layer::DeltaLayerWriter,
|
||||
until: Lsn,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use LayerKind::*;
|
||||
|
||||
@@ -1891,9 +1905,9 @@ impl ResidentLayer {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn get_as_delta(
|
||||
pub(crate) async fn as_delta(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
ctx: &mut RequestContext,
|
||||
) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
|
||||
use LayerKind::*;
|
||||
match self.downloaded.get(&self.owner.0, ctx).await? {
|
||||
@@ -1901,18 +1915,6 @@ impl ResidentLayer {
|
||||
Image(_) => Err(anyhow::anyhow!("image layer")),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn get_as_image(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<&image_layer::ImageLayerInner> {
|
||||
use LayerKind::*;
|
||||
match self.downloaded.get(&self.owner.0, ctx).await? {
|
||||
Image(ref d) => Ok(d),
|
||||
Delta(_) => Err(anyhow::anyhow!("delta layer")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ResidentLayer {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user