Compare commits

..

30 Commits

Author SHA1 Message Date
Rahul Patil
fe5909ee63 use from main 2024-09-17 11:40:27 +02:00
Rahul Patil
25c82f0f63 fix docker installation 2024-09-17 11:40:15 +02:00
Rahul Patil
f6adc2941f use bookworm 2024-09-17 11:40:15 +02:00
Rahul Patil
35c19b04e4 install requires https package 2024-09-17 11:40:15 +02:00
Rahul Patil
f3763ffbb6 use global shell 2024-09-17 11:40:15 +02:00
Rahul Patil
732507dda4 Trigger Build to re-run build 2024-09-17 11:40:15 +02:00
Rahul Patil
48e320a25a Update neon artifact name to include bookworm 2024-09-17 11:40:15 +02:00
Rahul Patil
62de4c4a14 update missing packages 2024-09-17 11:39:25 +02:00
Rahul Patil
0256640050 upgrade plv8 to fix build 2024-09-17 11:39:25 +02:00
Rahul Patil
18e2ab54ad update base image for vm-impage 2024-09-17 11:39:25 +02:00
Rahul Patil
39bbac63bb try new version of plv8 2024-09-17 11:39:21 +02:00
Rahul Patil
f54f3c217b trigger build extention 2024-09-17 11:38:58 +02:00
Rahul Patil
b409133a42 use different cache key for pyenv 2024-09-17 11:38:58 +02:00
Rahul Patil
b41ef842ea try with patch version 2024-09-17 11:38:49 +02:00
Rahul Patil
3c619d23cb add new image suffix and update cache key 2024-09-17 11:38:26 +02:00
Rahul Patil
7b314dd750 update cache key 2024-09-17 11:38:26 +02:00
Rahul Patil
fcfa9164e3 check on other arch 2024-09-17 11:38:26 +02:00
Rahul Patil
43fe1facf0 update cache key for new postgres new build 2024-09-17 11:38:26 +02:00
Rahul Patil
7a5492b4e0 disable custom extention to see other build status 2024-09-17 11:38:23 +02:00
Rahul Patil
1f9aaf026e revert 2024-09-17 11:38:02 +02:00
Rahul Patil
d2f90d06f2 revert last change 2024-09-17 11:38:02 +02:00
Rahul Patil
734f0d8493 revert change for build tools, compute-node and other
try to build only base dockerfile
2024-09-17 11:38:02 +02:00
Rahul Patil
3ba2623bbd temp change cache key to test build 2024-09-17 11:38:02 +02:00
Rahul Patil
e599777b22 disable other build to fail fast 2024-09-17 11:37:57 +02:00
Rahul Patil
69122999e3 use latest version for SFCGAL 2024-09-17 11:37:23 +02:00
Rahul Patil
afa3d4ea74 temporary use different cache key for test 2024-09-17 11:36:50 +02:00
Rahul Patil
9567cfcbb8 update libicu package 2024-09-17 11:36:50 +02:00
Rahul Patil
33d79cbc2c fix build extension 2024-09-17 11:36:44 +02:00
Rahul Patil
55356ced61 fix missing packages 2024-09-17 11:36:04 +02:00
Rahul Patil
34c697a65e wip: test with changing base image to bookworm-slim 2024-09-17 11:36:00 +02:00
134 changed files with 2310 additions and 2185 deletions

View File

@@ -183,7 +183,7 @@ runs:
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
key: v2-${{ runner.os }}-temp-python-deps-${{ hashFiles('poetry.lock') }}
- name: Store Allure test stat in the DB (new)
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}

View File

@@ -1,53 +1,53 @@
name: 'Run python test'
description: 'Runs a Neon python test set, performing all the required preparations before'
name: "Run python test"
description: "Runs a Neon python test set, performing all the required preparations before"
inputs:
build_type:
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster'
required: true
test_selection:
description: 'A python test suite to run'
description: "A python test suite to run"
required: true
extra_params:
description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
required: false
default: ''
default: ""
needs_postgres_source:
description: 'Set to true if the test suite requires postgres source checked out'
description: "Set to true if the test suite requires postgres source checked out"
required: false
default: 'false'
default: "false"
run_in_parallel:
description: 'Whether to run tests in parallel'
description: "Whether to run tests in parallel"
required: false
default: 'true'
default: "true"
save_perf_report:
description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set'
description: "Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set"
required: false
default: 'false'
default: "false"
run_with_real_s3:
description: 'Whether to pass real s3 credentials to the test suite'
description: "Whether to pass real s3 credentials to the test suite"
required: false
default: 'false'
default: "false"
real_s3_bucket:
description: 'Bucket name for real s3 tests'
description: "Bucket name for real s3 tests"
required: false
default: ''
default: ""
real_s3_region:
description: 'Region name for real s3 tests'
description: "Region name for real s3 tests"
required: false
default: ''
default: ""
rerun_flaky:
description: 'Whether to rerun flaky tests'
description: "Whether to rerun flaky tests"
required: false
default: 'false'
default: "false"
pg_version:
description: 'Postgres version to use for tests'
description: "Postgres version to use for tests"
required: false
default: 'v16'
default: "v16"
benchmark_durations:
description: 'benchmark durations JSON'
description: "benchmark durations JSON"
required: false
default: '{}'
default: "{}"
runs:
using: "composite"
@@ -56,14 +56,14 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
name: neon-${{ runner.os }}-bookworm-${{ inputs.build_type }}-artifact
path: /tmp/neon
- name: Download Neon binaries for the previous release
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
name: neon-${{ runner.os }}-bookworm-${{ inputs.build_type }}-artifact
path: /tmp/neon-previous
prefix: latest
@@ -88,7 +88,7 @@ runs:
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
key: v2-${{ runner.os }}-temp-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -euxo pipefail {0}

View File

@@ -26,9 +26,15 @@ on:
description: Azure tenant ID
required: true
type: string
skip_if:
description: Skip the job if this expression is true
required: true
type: boolean
jobs:
push-to-acr:
if: ${{ !inputs.skip_if }}
runs-on: ubuntu-22.04
permissions:
contents: read # This is required for actions/checkout

View File

@@ -54,8 +54,8 @@ jobs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -106,7 +106,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
key: v2-${{ runner.os }}-temp-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
@@ -120,59 +120,6 @@ jobs:
- name: Run mypy to check types
run: poetry run mypy .
# Check that the vendor/postgres-* submodules point to the
# corresponding REL_*_STABLE_neon branches.
check-submodules:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
- uses: dorny/paths-filter@v3
id: check-if-submodules-changed
with:
filters: |
vendor:
- 'vendor/**'
- name: Check vendor/postgres-v14 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v14"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v15 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v15"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v16 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v16"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v17 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v17"
fetch_depth: "50"
sub_fetch_depth: "50"
pass_if_unchanged: true
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
strategy:
@@ -212,10 +159,6 @@ jobs:
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
#
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
# time just for that, so skip "clippy --release".
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
@@ -225,6 +168,8 @@ jobs:
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
@@ -253,21 +198,266 @@ jobs:
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
# Do not build or run tests in debug for release branches
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
include:
- build-type: release
arch: arm64
uses: ./.github/workflows/_build-and-test-locally.yml
with:
arch: ${{ matrix.arch }}
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
secrets: inherit
build_type: [ debug, release ]
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
# It also includes --features, if any
#
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
{
echo "cov_prefix=${cov_prefix}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
} >> $GITHUB_ENV
# Disabled for now
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
# directory, and it's faster to let 'cargo' to rebuild it from the
# compressed crates.
# - name: Cache cargo deps
# id: cache_cargo
# uses: actions/cache@v4
# with:
# path: |
# ~/.cargo/registry/
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
# key: |
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-temp-v2-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-temp-v2-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-temp-v2-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
run: |
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
- name: Install rust binaries
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
binaries=$(
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
cp "$SRC" "$DST"
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
# We don't need debug symbols for code coverage, so strip them out to make
# the artifact smaller.
strip "$SRC" -o "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
- name: Install postgres binaries
run: cp -a pg_install /tmp/neon/pg_install
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-bookworm-${{ matrix.build_type }}-artifact
path: /tmp/neon
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
pg_version: [ v14, v15, v16 ]
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
build_type: ${{ matrix.build_type }}
test_selection: regress
needs_postgres_source: true
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
get-benchmarks-durations:
@@ -429,8 +619,8 @@ jobs:
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
# Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
@@ -438,7 +628,7 @@ jobs:
- name: Get Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
name: neon-${{ runner.os }}-bookworm-${{ matrix.build_type }}-artifact
path: /tmp/neon
- name: Get coverage artifact
@@ -531,9 +721,11 @@ jobs:
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
@@ -602,15 +794,18 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16, v17 ]
version: [ v14 ]
# version: [ v14, v15, v16 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/setup-buildx-action@v3
@@ -757,7 +952,10 @@ jobs:
VM_BUILDER_VERSION: v0.29.3
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Downloading vm-builder
run: |
@@ -797,7 +995,10 @@ jobs:
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
steps:
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
- uses: docker/login-action@v3
@@ -904,7 +1105,6 @@ jobs:
done
push-to-acr-dev:
if: github.ref_name == 'main'
needs: [ tag, promote-images ]
uses: ./.github/workflows/_push-to-acr.yml
with:
@@ -914,9 +1114,9 @@ jobs:
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
skip_if: ${{ github.ref_name != 'main' }}
push-to-acr-prod:
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
needs: [ tag, promote-images ]
uses: ./.github/workflows/_push-to-acr.yml
with:
@@ -926,6 +1126,7 @@ jobs:
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
skip_if: ${{ !startsWith(github.ref_name, 'release') }}
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
@@ -1003,8 +1204,7 @@ jobs:
deploy:
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1023,7 +1223,10 @@ jobs:
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Trigger deploy workflow
env:
@@ -1102,8 +1305,7 @@ jobs:
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: github.ref_name == 'release' && !failure() && !cancelled()
if: github.ref_name == 'release'
runs-on: ubuntu-22.04
steps:
@@ -1181,8 +1383,18 @@ jobs:
done
done
for f in "${files_to_promote[@]}"; do
time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-bookworm-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:
@@ -1207,8 +1419,6 @@ jobs:
# Usually we do `needs: [...]`
needs:
- build-and-test-locally
# XXX: Temporarily disabled, while we investigate an unexpected failure with it
#- check-submodules
- check-codestyle-python
- check-codestyle-rust
- promote-images

View File

@@ -40,7 +40,8 @@ jobs:
- name: Check if such tag found in the registry
id: check-image
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
# TODO: temp set -new suffix to create new build tool image
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}-new
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true

View File

@@ -81,21 +81,21 @@ jobs:
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-temp-v2-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-temp-v2-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-temp-v2-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v17 build
id: cache_pg_17
@@ -131,10 +131,6 @@ jobs:
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: make postgres-v17 -j$(sysctl -n hw.ncpu)
- name: Build neon extensions
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
@@ -147,6 +143,214 @@ jobs:
- name: Check that no warnings are produced
run: ./run_clippy.sh
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
CARGO_FEATURES: --features testing
CARGO_FLAGS: --release
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set env variables
run: |
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-temp-v2-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-temp-v2-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-temp-v2-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES -j$(nproc)
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
if: matrix.build_type == 'debug'
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
if: matrix.build_type == 'release'
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
if: matrix.build_type == 'release'
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: cargo deny check
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
if: |

View File

@@ -34,8 +34,8 @@ jobs:
build-tag: ${{ steps.build-tag.outputs.tag }}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

4
.gitmodules vendored
View File

@@ -1,11 +1,11 @@
[submodule "vendor/postgres-v14"]
path = vendor/postgres-v14
url = https://github.com/neondatabase/postgres.git
branch = REL_14_STABLE_neon
branch = fix/warning-in-guc-var-14
[submodule "vendor/postgres-v15"]
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git
branch = REL_15_STABLE_neon
branch = fix/warning-in-guc-var-15
[submodule "vendor/postgres-v16"]
path = vendor/postgres-v16
url = https://github.com/neondatabase/postgres.git

451
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -76,6 +76,8 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "7.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
enum-map = "2.4.2"
@@ -93,7 +95,7 @@ hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.4"
hostname = "0.3.1"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
@@ -102,6 +104,7 @@ hyper = "0.14"
tokio-tungstenite = "0.20.0"
indexmap = "2"
indoc = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
@@ -110,7 +113,7 @@ libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
memoffset = "0.8"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
@@ -139,6 +142,7 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.22"
rustls-pemfile = "2"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -160,6 +164,7 @@ strum_macros = "0.26"
svg_fmt = "0.4.3"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
thiserror = "1.0"
tikv-jemallocator = "0.5"

View File

@@ -43,30 +43,31 @@ COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin storage_controller \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--locked --release
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin storage_controller \
--bin proxy \
--bin neon_local \
--locked --release \
&& cachepot -s
# Build final image
#
FROM debian:bullseye-slim
ARG DEFAULT_PG_VERSION
FROM debian:bookworm-slim
WORKDIR /data
RUN set -e \
&& apt update \
&& apt install -y \
libreadline-dev \
libseccomp-dev \
ca-certificates \
libreadline-dev \
libseccomp-dev \
libicu72 \
openssl \
ca-certificates \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& useradd -d /data neon \
&& chown -R neon:neon /data
@@ -89,15 +90,13 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
RUN mkdir -p /data/.neon/ && \
echo "id=1234" > "/data/.neon/identity.toml" && \
echo "broker_endpoint='http://storage_broker:50051'\n" \
"pg_distrib_dir='/usr/local/'\n" \
"listen_pg_addr='0.0.0.0:6400'\n" \
"listen_http_addr='0.0.0.0:9898'\n" \
"availability_zone='local'\n" \
> /data/.neon/pageserver.toml && \
chown -R neon:neon /data/.neon
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \
-c "broker_endpoint='http://storage_broker:50051'" \
-c "pg_distrib_dir='/usr/local/'" \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.

View File

@@ -1,4 +1,4 @@
FROM debian:bullseye-slim
FROM debian:bookworm-slim
# Use ARG as a build-time environment variable here to allow.
# It's not supposed to be set outside.
@@ -6,7 +6,7 @@ FROM debian:bullseye-slim
# ```
# . /etc/os-release && echo "${VERSION_CODENAME}"
# ```
ARG DEBIAN_VERSION_CODENAME=bullseye
ARG DEBIAN_VERSION_CODENAME=bookworm
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
@@ -16,45 +16,47 @@ SHELL ["/bin/bash", "-c"]
RUN set -e \
&& apt update \
&& apt install -y \
autoconf \
automake \
bison \
build-essential \
ca-certificates \
cmake \
curl \
flex \
git \
gnupg \
gzip \
jq \
libcurl4-openssl-dev \
libbz2-dev \
libffi-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libreadline-dev \
libseccomp-dev \
libsqlite3-dev \
libssl-dev \
libstdc++-10-dev \
libtool \
libxml2-dev \
libxmlsec1-dev \
libxxhash-dev \
lsof \
make \
netcat \
net-tools \
openssh-client \
parallel \
pkg-config \
unzip \
wget \
xz-utils \
zlib1g-dev \
zstd \
autoconf \
automake \
bison \
build-essential \
ca-certificates \
cmake \
curl \
flex \
git \
gnupg \
gzip \
jq \
libcurl4-openssl-dev \
libbz2-dev \
libffi-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libpq-dev \
libreadline-dev \
libseccomp-dev \
libsqlite3-dev \
libssl-dev \
libstdc++-11-dev \
libtool \
libxml2-dev \
libxmlsec1-dev \
libxxhash-dev \
lsof \
make \
netcat-traditional \
net-tools \
openssh-client \
parallel \
pkg-config \
unzip \
wget \
xz-utils \
zlib1g-dev \
zstd \
software-properties-common \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# protobuf-compiler (protoc)
@@ -74,15 +76,18 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
# LLVM
ENV LLVM_VERSION=18
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& echo "deb http://apt.llvm.org/bookworm/ llvm-toolchain-bookworm-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Install docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
# PostgreSQL 14
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
&& apt install -y apt-transport-https ca-certificates curl gnupg-agent lsb-release \
&& curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor > /usr/share/keyrings/docker-ce-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-ce-archive-keyring.gpg] https://download.docker.com/linux/debian $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker-ce.list \
&& apt update \
&& apt install -y docker-ce docker-ce-cli \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

View File

@@ -9,7 +9,7 @@ ARG BUILD_TAG
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
FROM debian:bookworm-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
@@ -93,13 +93,9 @@ RUN apt update && \
protobuf-c-compiler xsltproc
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
RUN case "${PG_VERSION}" in "v17") \
mkdir -p /sfcgal && \
echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
esac && \
wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.4.1/SFCGAL-v1.4.1.tar.gz -O SFCGAL.tar.gz && \
echo "1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
@@ -161,30 +157,24 @@ FROM build-deps AS plv8-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
ENV PLV8_BRANCH=r3.2
ENV PLV8_VERSION=3.2.2
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
RUN apt update && \
apt install -y ninja-build python3-dev git libncurses5 binutils clang libstdc++-12-dev libtinfo5 pkg-config cmake
RUN set -ex \
&& export PATH="/usr/local/pgsql/bin:$PATH" \
&& git clone --branch ${PLV8_BRANCH} --single-branch --depth 1 https://github.com/plv8/plv8 \
&& cd plv8 \
&& make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install \
&& find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip \
&& cd .. && rm -rf ./plv8 \
# don't break computes with installed old version of plv8
cd /usr/local/pgsql/lib/ && \
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
&& ln -s /usr/local/pgsql/lib/plv8-${PLV8_VERSION}.so /usr/local/pgsql/lib/plv8-3.1.5.so \
&& ln -s /usr/local/pgsql/lib/plv8-${PLV8_VERSION}.so /usr/local/pgsql/lib/plv8-3.1.8.so \
&& echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -1027,7 +1017,7 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
#
#########################################################################################
FROM debian:bullseye-slim AS compute-tools-image
FROM debian:bookworm-slim AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
@@ -1144,7 +1134,7 @@ ENV PGDATABASE=postgres
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
FROM debian:bookworm-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
@@ -1165,7 +1155,7 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)
# libicu72, locales for collations (including ICU and plpgsql_check)
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
@@ -1176,7 +1166,7 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
libicu67 \
libicu72 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
@@ -1185,8 +1175,8 @@ RUN apt update && \
libboost-system1.74.0 \
libossp-uuid16 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libgdal32 \
libproj25 \
libprotobuf-c1 \
libsfcgal1 \
libxml2 \

View File

@@ -11,6 +11,7 @@ testing = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
chrono.workspace = true
cfg-if.workspace = true
clap.workspace = true
@@ -23,6 +24,7 @@ num_cpus.workspace = true
opentelemetry.workspace = true
postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
@@ -41,6 +43,7 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"

View File

@@ -9,10 +9,13 @@ anyhow.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
humantime.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
regex.workspace = true
@@ -20,6 +23,8 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
tar.workspace = true
thiserror.workspace = true
toml.workspace = true
toml_edit.workspace = true

View File

@@ -151,7 +151,7 @@ where
print!(".");
io::stdout().flush().unwrap();
}
tokio::time::sleep(RETRY_INTERVAL).await;
thread::sleep(RETRY_INTERVAL);
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");

View File

@@ -34,14 +34,12 @@ use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use std::borrow::Cow;
use std::collections::{BTreeSet, HashMap};
use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use std::time::Duration;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use tokio::task::JoinSet;
use url::Host;
use utils::{
auth::{Claims, Scope},
@@ -89,35 +87,34 @@ fn main() -> Result<()> {
// Check for 'neon init' command first.
let subcommand_result = if sub_name == "init" {
handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
handle_init(sub_args).map(Some)
} else {
// all other commands need an existing config
let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
let mut env =
LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
let original_env = env.clone();
let env = Box::leak(Box::new(env));
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, env)),
"timeline" => rt.block_on(handle_timeline(sub_args, env)),
"start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
"stop" => rt.block_on(handle_stop_all(sub_args, env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
"storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
"mappings" => handle_mappings(sub_args, env),
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
"mappings" => handle_mappings(sub_args, &mut env),
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
_ => bail!("unexpected subcommand {sub_name}"),
};
if &original_env != env {
subcommand_result.map(|()| Some(Cow::Borrowed(env)))
if original_env != env {
subcommand_result.map(|()| Some(env))
} else {
subcommand_result.map(|()| None)
}
@@ -1248,122 +1245,49 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(broker_command_data) => broker_command_data,
None => bail!("no broker subcommand provided"),
};
match sub_name {
"start" => {
if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
eprintln!("broker start failed: {e}");
exit(1);
}
}
"stop" => {
if let Err(e) = broker::stop_broker_process(env) {
eprintln!("broker stop failed: {e}");
exit(1);
}
}
_ => bail!("Unexpected broker subcommand '{}'", sub_name),
}
Ok(())
}
async fn handle_start_all(
env: &'static local_env::LocalEnv,
env: &local_env::LocalEnv,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
neon_start_status_check(env, retry_timeout)
.await
.context("status check after successful startup of all services")?;
return Ok(());
};
eprintln!("startup failed because one or more services could not be started");
for e in errors {
eprintln!("{e}");
let debug_repr = format!("{e:?}");
for line in debug_repr.lines() {
eprintln!(" {line}");
}
}
try_stop_all(env, true).await;
exit(2);
}
/// Returns Ok() if and only if all services could be started successfully.
/// Otherwise, returns the list of errors that occurred during startup.
async fn handle_start_all_impl(
env: &'static local_env::LocalEnv,
retry_timeout: Duration,
) -> Result<(), Vec<anyhow::Error>> {
// Endpoints are not started automatically
let mut js = JoinSet::new();
broker::start_broker_process(env, retry_timeout).await?;
// force infalliblity through closure
#[allow(clippy::redundant_closure_call)]
(|| {
js.spawn(async move {
let retry_timeout = retry_timeout;
broker::start_broker_process(env, &retry_timeout).await
});
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
js.spawn(async move {
let storage_controller = StorageController::from_env(env);
storage_controller
.start(NeonStorageControllerStartArgs::with_default_instance_id(
retry_timeout.into(),
))
.await
.map_err(|e| e.context("start storage_controller"))
});
}
for ps_conf in &env.pageservers {
js.spawn(async move {
let pageserver = PageServerNode::from_env(env, ps_conf);
pageserver
.start(&retry_timeout)
.await
.map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
});
}
for node in env.safekeepers.iter() {
js.spawn(async move {
let safekeeper = SafekeeperNode::from_env(env, node);
safekeeper
.start(vec![], &retry_timeout)
.await
.map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
});
}
})();
let mut errors = Vec::new();
while let Some(result) = js.join_next().await {
let result = result.expect("we don't panic or cancel the tasks");
if let Err(e) = result {
errors.push(e);
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller
.start(NeonStorageControllerStartArgs::with_default_instance_id(
(*retry_timeout).into(),
))
.await
{
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
}
}
if !errors.is_empty() {
return Err(errors);
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start(retry_timeout).await {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
exit(1);
}
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
exit(1);
}
}
neon_start_status_check(env, retry_timeout).await?;
Ok(())
}
@@ -1748,19 +1672,6 @@ fn cli() -> Command {
.arg(stop_mode_arg.clone())
.arg(instance_id))
)
.subcommand(
Command::new("storage_broker")
.arg_required_else_help(true)
.about("Manage broker")
.subcommand(Command::new("start")
.about("Start broker")
.arg(timeout_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop broker")
.arg(stop_mode_arg.clone())
)
)
.subcommand(
Command::new("safekeeper")
.arg_required_else_help(true)

View File

@@ -702,7 +702,7 @@ impl Endpoint {
}
}
}
tokio::time::sleep(ATTEMPT_INTERVAL).await;
std::thread::sleep(ATTEMPT_INTERVAL);
}
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)

View File

@@ -17,7 +17,9 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -322,6 +324,22 @@ impl PageServerNode {
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub async fn page_server_psql_client(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
let mut config = self.pg_connection_config.clone();
if self.conf.pg_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
config = config.set_password(Some(token));
}
Ok(config.connect_no_tls().await?)
}
pub async fn check_status(&self) -> mgmt_api::Result<()> {
self.http_client.status().await
}
@@ -522,6 +540,19 @@ impl PageServerNode {
Ok(())
}
pub async fn location_config(
&self,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<Duration>,
lazy: bool,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_shard_id, config, flush_ms, lazy)
.await?)
}
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
@@ -605,4 +636,14 @@ impl PageServerNode {
Ok(())
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<TenantHistorySize> {
Ok(self
.http_client
.tenant_synthetic_size(tenant_shard_id)
.await?)
}
}

View File

@@ -4,10 +4,13 @@
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
/// enough to extract a few settings we need in Neon, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{bail, Context, Result};
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
use std::io::BufRead;
use std::str::FromStr;
/// In-memory representation of a postgresql.conf file
#[derive(Default, Debug)]
@@ -16,16 +19,84 @@ pub struct PostgresConf {
hash: HashMap<String, String>,
}
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
impl PostgresConf {
pub fn new() -> PostgresConf {
PostgresConf::default()
}
/// Read file into memory
pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
let mut result = Self::new();
for line in std::io::BufReader::new(read).lines() {
let line = line?;
// Store each line in a vector, in original format
result.lines.push(line.clone());
// Also parse each line and insert key=value lines into a hash map.
//
// FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
// But it's close enough for our usage.
let line = line.trim();
if line.starts_with('#') {
// comment, ignore
continue;
} else if let Some(caps) = CONF_LINE_RE.captures(line) {
let name = caps.get(1).unwrap().as_str();
let raw_val = caps.get(2).unwrap().as_str();
if let Ok(val) = deescape_str(raw_val) {
// Note: if there's already an entry in the hash map for
// this key, this will replace it. That's the behavior what
// we want; when PostgreSQL reads the file, each line
// overrides any previous value for the same setting.
result.hash.insert(name.to_string(), val.to_string());
}
}
}
Ok(result)
}
/// Return the current value of 'option'
pub fn get(&self, option: &str) -> Option<&str> {
self.hash.get(option).map(|x| x.as_ref())
}
/// Return the current value of a field, parsed to the right datatype.
///
/// This calls the FromStr::parse() function on the value of the field. If
/// the field does not exist, or parsing fails, returns an error.
///
pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
self.get(field_name)
.with_context(|| format!("could not find '{}' option {}", field_name, context))?
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
}
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
if let Some(val) = self.get(field_name) {
let result = val
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
Ok(Some(result))
} else {
Ok(None)
}
}
///
/// Note: if you call this multiple times for the same option, the config
/// file will a line for each call. It would be nice to have a function
@@ -83,8 +154,48 @@ fn escape_str(s: &str) -> String {
}
}
/// De-escape a possibly-quoted value.
///
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
/// does this.
fn deescape_str(s: &str) -> Result<String> {
// If the string has a quote at the beginning and end, strip them out.
if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
let mut result = String::new();
let mut iter = s[1..(s.len() - 1)].chars().peekable();
while let Some(c) = iter.next() {
let newc = if c == '\\' {
match iter.next() {
Some('b') => '\x08',
Some('f') => '\x0c',
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('0'..='7') => {
// TODO
bail!("octal escapes not supported");
}
Some(n) => n,
None => break,
}
} else if c == '\'' && iter.peek() == Some(&'\'') {
// doubled quote becomes just one quote
iter.next().unwrap()
} else {
c
};
result.push(newc);
}
Ok(result)
} else {
Ok(s.to_string())
}
}
#[test]
fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
fn test_postgresql_conf_escapes() -> Result<()> {
assert_eq!(escape_str("foo bar"), "'foo bar'");
// these don't need to be quoted
assert_eq!(escape_str("foo"), "foo");
@@ -103,5 +214,13 @@ fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
assert_eq!(escape_str("10 cats"), "'10 cats'");
// Test de-escaping
assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
// octal-escapes are currently not supported
assert!(deescape_str("'foo\\7\\07\\007'").is_err());
Ok(())
}

View File

@@ -11,11 +11,14 @@ clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
humantime.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true

View File

@@ -8,6 +8,7 @@ license.workspace = true
anyhow.workspace = true
chrono.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
regex.workspace = true

View File

@@ -5,6 +5,9 @@ edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow.workspace = true
chrono = { workspace = true, features = ["serde"] }
rand.workspace = true
serde.workspace = true
serde_with.workspace = true
utils.workspace = true

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use rand::Rng;
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
#[derive(serde::Serialize, Deserialize)]
#[derive(serde::Serialize, serde::Deserialize)]
pub struct EventChunk<'a, T: Clone> {
pub events: std::borrow::Cow<'a, [T]>,
}

View File

@@ -12,4 +12,5 @@ bytes.workspace = true
utils.workspace = true
parking_lot.workspace = true
hex.workspace = true
scopeguard.workspace = true
smallvec = { workspace = true, features = ["write"] }

View File

@@ -104,6 +104,9 @@ pub struct ConfigToml {
pub image_compression: ImageCompressionAlgorithm,
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: Option<crate::models::L0FlushConfig>,
#[serde(skip_serializing)]
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
pub io_buffer_alignment: usize,
}
@@ -170,6 +173,40 @@ impl Default for EvictionOrder {
}
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetVectoredImpl {
Sequential,
Vectored,
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetImpl {
Legacy,
Vectored,
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
@@ -301,6 +338,8 @@ pub mod defaults {
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Zstd { level: Some(1) };
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
@@ -337,10 +376,7 @@ impl Default for ConfigToml {
concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant")),
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
)
.unwrap(),
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
metric_collection_interval: (humantime::parse_duration(
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
@@ -381,6 +417,7 @@ impl Default for ConfigToml {
image_compression: (DEFAULT_IMAGE_COMPRESSION),
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: None,
compact_level0_phase1_value_access: Default::default(),
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
@@ -430,6 +467,8 @@ pub mod tenant_conf_defaults {
// By default ingest enough WAL for two new L0 layers before checking if new image
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
impl Default for TenantConfigToml {

View File

@@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings {
pub kind: CompactionAlgorithm,
}
#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[serde(rename_all = "snake_case")]

View File

@@ -5,8 +5,10 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait.workspace = true
anyhow.workspace = true
bytes.workspace = true
futures.workspace = true
rustls.workspace = true
serde.workspace = true
thiserror.workspace = true

View File

@@ -280,6 +280,16 @@ pub struct PostgresBackend<IO> {
pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
let mut query_string = query_string.to_vec();
if let Some(ch) = query_string.last() {
if *ch == 0 {
query_string.pop();
}
}
query_string
}
/// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);

View File

@@ -5,10 +5,13 @@ edition.workspace = true
license.workspace = true
[dependencies]
rand.workspace = true
regex.workspace = true
bytes.workspace = true
byteorder.workspace = true
anyhow.workspace = true
crc32c.workspace = true
hex.workspace = true
once_cell.workspace = true
log.workspace = true
memoffset.workspace = true

View File

@@ -9,8 +9,8 @@
//! comments on them.
//!
use crate::PageHeaderData;
use crate::BLCKSZ;
use crate::{PageHeaderData, XLogRecord};
//
// From pg_tablespace_d.h
@@ -194,6 +194,8 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
//
// from xlogrecord.h
//
@@ -217,6 +219,8 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
/* From transam.h */
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
pub const INVALID_TRANSACTION_ID: u32 = 0;
pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
/* pg_control.h */
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;

View File

@@ -26,7 +26,6 @@ use bytes::{Buf, Bytes};
use log::*;
use serde::Serialize;
use std::ffi::OsStr;
use std::fs::File;
use std::io::prelude::*;
use std::io::ErrorKind;
@@ -79,34 +78,19 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
)
}
pub fn XLogFromFileName(
fname: &OsStr,
wal_seg_size: usize,
) -> anyhow::Result<(XLogSegNo, TimeLineID)> {
if let Some(fname_str) = fname.to_str() {
let tli = u32::from_str_radix(&fname_str[0..8], 16)?;
let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo;
let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo;
Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli))
} else {
anyhow::bail!("non-ut8 filename: {:?}", fname);
}
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
}
pub fn IsXLogFileName(fname: &OsStr) -> bool {
if let Some(fname) = fname.to_str() {
fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
} else {
false
}
pub fn IsXLogFileName(fname: &str) -> bool {
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
}
pub fn IsPartialXLogFileName(fname: &OsStr) -> bool {
if let Some(fname) = fname.to_str() {
fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8]))
} else {
false
}
pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
}
/// If LSN points to the beginning of the page, then shift it to first record,

View File

@@ -9,6 +9,7 @@ anyhow.workspace = true
clap.workspace = true
env_logger.workspace = true
log.workspace = true
once_cell.workspace = true
postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true

View File

@@ -7,7 +7,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
@@ -27,6 +26,7 @@ macro_rules! xlog_utils_test {
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Conf {
pub pg_version: u32,
pub pg_distrib_dir: PathBuf,
@@ -136,8 +136,8 @@ impl Conf {
pub fn pg_waldump(
&self,
first_segment_name: &OsStr,
last_segment_name: &OsStr,
first_segment_name: &str,
last_segment_name: &str,
) -> anyhow::Result<std::process::Output> {
let first_segment_file = self.datadir.join(first_segment_name);
let last_segment_file = self.datadir.join(last_segment_name);

View File

@@ -4,7 +4,6 @@ use super::*;
use crate::{error, info};
use regex::Regex;
use std::cmp::min;
use std::ffi::OsStr;
use std::fs::{self, File};
use std::io::Write;
use std::{env, str::FromStr};
@@ -55,7 +54,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name())
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
@@ -71,11 +70,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name();
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap();
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
continue;
@@ -94,10 +93,10 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
}
}
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump(OsStr::new("000000010000000000000001"), last_segment)
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
@@ -118,7 +117,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
fn check_end_of_wal(
cfg: &crate::Conf,
last_segment: &OsStr,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
) {
@@ -133,8 +132,7 @@ fn check_end_of_wal(
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(last_segment),
cfg.wal_dir()
.join(format!("{}.partial", last_segment.to_str().unwrap())),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
@@ -144,8 +142,7 @@ fn check_end_of_wal(
);
assert_eq!(wal_end, expected_end_of_wal);
fs::rename(
cfg.wal_dir()
.join(format!("{}.partial", last_segment.to_str().unwrap())),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();

View File

@@ -8,8 +8,10 @@ license.workspace = true
bytes.workspace = true
byteorder.workspace = true
itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tracing.workspace = true
thiserror.workspace = true
serde.workspace = true

View File

@@ -13,11 +13,14 @@ aws-smithy-async.workspace = true
aws-smithy-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino = { workspace = true, features = ["serde1"] }
humantime.workspace = true
humantime-serde.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }

View File

@@ -127,6 +127,10 @@ impl RemotePath {
&self.0
}
pub fn extension(&self) -> Option<&str> {
self.0.extension()
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}

View File

@@ -6,5 +6,6 @@ license.workspace = true
[dependencies]
serde.workspace = true
serde_with.workspace = true
const_format.workspace = true
utils.workspace = true

View File

@@ -9,9 +9,8 @@ hyper.workspace = true
opentelemetry = { workspace = true, features=["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
[dev-dependencies]
tracing-subscriber.workspace = true # For examples in docs
tracing-subscriber.workspace = true

View File

@@ -42,6 +42,7 @@ tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
rand.workspace = true
serde_with.workspace = true
strum.workspace = true
strum_macros.workspace = true
url.workspace = true

View File

@@ -82,7 +82,7 @@ impl ApiError {
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
}

View File

@@ -21,13 +21,7 @@
//!
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
use std::{
sync::{
atomic::{AtomicU64, Ordering},
Mutex,
},
time::Duration,
};
use std::{sync::Mutex, time::Duration};
use tokio::{sync::Notify, time::Instant};
@@ -134,7 +128,6 @@ impl LeakyBucketState {
pub struct RateLimiter {
pub config: LeakyBucketConfig,
pub sleep_counter: AtomicU64,
pub state: Mutex<LeakyBucketState>,
/// a queue to provide this fair ordering.
pub queue: Notify,
@@ -151,7 +144,6 @@ impl Drop for Requeue<'_> {
impl RateLimiter {
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
RateLimiter {
sleep_counter: AtomicU64::new(0),
state: Mutex::new(LeakyBucketState::with_initial_tokens(
&config,
initial_tokens,
@@ -171,16 +163,15 @@ impl RateLimiter {
/// returns true if we did throttle
pub async fn acquire(&self, count: usize) -> bool {
let start = tokio::time::Instant::now();
let mut throttled = false;
let start_count = self.sleep_counter.load(Ordering::Acquire);
let mut end_count = start_count;
let start = tokio::time::Instant::now();
// wait until we are the first in the queue
let mut notified = std::pin::pin!(self.queue.notified());
if !notified.as_mut().enable() {
throttled = true;
notified.await;
end_count = self.sleep_counter.load(Ordering::Acquire);
}
// notify the next waiter in the queue when we are done.
@@ -193,22 +184,9 @@ impl RateLimiter {
.unwrap()
.add_tokens(&self.config, start, count as f64);
match res {
Ok(()) => return end_count > start_count,
Ok(()) => return throttled,
Err(ready_at) => {
struct Increment<'a>(&'a AtomicU64);
impl Drop for Increment<'_> {
fn drop(&mut self) {
self.0.fetch_add(1, Ordering::AcqRel);
}
}
// increment the counter after we finish sleeping (or cancel this task).
// this ensures that tasks that have already started the acquire will observe
// the new sleep count when they are allowed to resume on the notify.
let _inc = Increment(&self.sleep_counter);
end_count += 1;
throttled = true;
tokio::time::sleep_until(ready_at).await;
}
}

View File

@@ -120,6 +120,32 @@ impl<K: Ord, V> VecMap<K, V> {
Ok((None, delta_size))
}
/// Split the map into two.
///
/// The left map contains everything before `cutoff` (exclusive).
/// Right map contains `cutoff` and everything after (inclusive).
pub fn split_at(&self, cutoff: &K) -> (Self, Self)
where
K: Clone,
V: Clone,
{
let split_idx = self
.data
.binary_search_by_key(&cutoff, extract_key)
.unwrap_or_else(std::convert::identity);
(
VecMap {
data: self.data[..split_idx].to_vec(),
ordering: self.ordering,
},
VecMap {
data: self.data[split_idx..].to_vec(),
ordering: self.ordering,
},
)
}
/// Move items from `other` to the end of `self`, leaving `other` empty.
/// If the `other` ordering is different from `self` ordering
/// `ExtendOrderingError` error will be returned.

View File

@@ -15,11 +15,13 @@ anyhow.workspace = true
axum.workspace = true
clap.workspace = true
futures.workspace = true
inotify.workspace = true
serde.workspace = true
serde_json.workspace = true
sysinfo.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true

View File

@@ -15,6 +15,7 @@ anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
bit_field.workspace = true
byteorder.workspace = true
bytes.workspace = true
@@ -22,9 +23,12 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
@@ -53,6 +57,10 @@ serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_path_to_error.workspace = true
serde_with.workspace = true
signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
@@ -65,6 +73,7 @@ tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
twox-hash.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true

View File

@@ -1,7 +1,7 @@
//! Quantify a single walredo manager's throughput under N concurrent callers.
//!
//! The benchmark implementation ([`bench_impl`]) is parametrized by
//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
//! - `nclients` => number of clients (more on this shortly).
//!
@@ -10,7 +10,7 @@
//! Each task executes the `redo_work` `n_redos/nclients` times.
//!
//! We exercise the following combinations:
//! - `redo_work = ping / short / medium``
//! - `redo_work = short / medium``
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
//!
//! We let `criterion` determine the `n_redos` using `iter_custom`.
@@ -27,43 +27,33 @@
//!
//! # Reference Numbers
//!
//! 2024-09-18 on im4gn.2xlarge
//! 2024-04-15 on i3en.3xlarge
//!
//! ```text
//! ping/1 time: [21.789 µs 21.918 µs 22.078 µs]
//! ping/2 time: [27.686 µs 27.812 µs 27.970 µs]
//! ping/4 time: [35.468 µs 35.671 µs 35.926 µs]
//! ping/8 time: [59.682 µs 59.987 µs 60.363 µs]
//! ping/16 time: [101.79 µs 102.37 µs 103.08 µs]
//! ping/32 time: [184.18 µs 185.15 µs 186.36 µs]
//! ping/64 time: [349.86 µs 351.45 µs 353.47 µs]
//! ping/128 time: [684.53 µs 687.98 µs 692.17 µs]
//! short/1 time: [31.833 µs 32.126 µs 32.428 µs]
//! short/2 time: [35.558 µs 35.756 µs 35.992 µs]
//! short/4 time: [44.850 µs 45.138 µs 45.484 µs]
//! short/8 time: [65.985 µs 66.379 µs 66.853 µs]
//! short/16 time: [127.06 µs 127.90 µs 128.87 µs]
//! short/32 time: [252.98 µs 254.70 µs 256.73 µs]
//! short/64 time: [497.13 µs 499.86 µs 503.26 µs]
//! short/128 time: [987.46 µs 993.45 µs 1.0004 ms]
//! medium/1 time: [137.91 µs 138.55 µs 139.35 µs]
//! medium/2 time: [192.00 µs 192.91 µs 194.07 µs]
//! medium/4 time: [389.62 µs 391.55 µs 394.01 µs]
//! medium/8 time: [776.80 µs 780.33 µs 784.77 µs]
//! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms]
//! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms]
//! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms]
//! medium/128 time: [10.412 ms 10.574 ms 10.718 ms]
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! ```
use anyhow::Context;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
future::Future,
sync::Arc,
time::{Duration, Instant},
};
@@ -71,59 +61,40 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
macro_rules! bench_group {
($name:expr, $redo_work:expr) => {{
let name: &str = $name;
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(name);
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
},
);
}
}};
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
//
// benchmark the protocol implementation
//
let pg_version = 14;
bench_group!(
"ping",
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
let _: () = mgr.ping(pg_version).await.unwrap();
})
);
//
// benchmarks with actual record redo
//
let make_redo_work = |req: &'static Request| {
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
let page = req.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
})
};
bench_group!("short", {
static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
make_redo_work(&REQUEST)
});
bench_group!("medium", {
static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
make_redo_work(&REQUEST)
});
}
criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
where
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
Fut: Future<Output = ()> + Send + 'static,
{
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
@@ -164,20 +135,17 @@ where
})
}
async fn client<F, Fut>(
async fn client(
mgr: Arc<PostgresRedoManager>,
start: Arc<Barrier>,
redo_work: Arc<F>,
redo_work: Arc<Request>,
n_redos: u64,
) -> Duration
where
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
Fut: Future<Output = ()> + Send + 'static,
{
) -> Duration {
start.wait().await;
let start = Instant::now();
for _ in 0..n_redos {
redo_work(Arc::clone(&mgr)).await;
let page = redo_work.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
// The real pageserver will rarely if ever do 2 walredos in a row without
// yielding to the executor.
tokio::task::yield_now().await;

View File

@@ -432,7 +432,7 @@ impl Client {
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, req)
self.request(Method::POST, &uri, req)
.await?
.json()
.await

View File

@@ -9,19 +9,41 @@ default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -8,6 +8,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
bytes.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
@@ -23,4 +24,5 @@ toml_edit.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -13,6 +13,7 @@ use pageserver_api::{
use remote_storage::{RemotePath, RemoteStorageConfig};
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
@@ -32,7 +33,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::virtual_file;
use crate::virtual_file::io_engine;
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
/// Global state of pageserver.
///
@@ -256,6 +257,17 @@ impl PageServerConf {
.join(timeline_id.to_string())
}
pub(crate) fn timeline_delete_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.workdir)
@@ -324,6 +336,7 @@ impl PageServerConf {
max_vectored_read_bytes,
image_compression,
ephemeral_bytes_per_memory_kb,
compact_level0_phase1_value_access: _,
l0_flush,
virtual_file_direct_io,
concurrent_tenant_warmup,
@@ -478,6 +491,11 @@ pub struct ConfigurableSemaphore {
}
impl ConfigurableSemaphore {
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
Some(x) => x,
None => panic!("const unwrap is not yet stable"),
};
/// Initializse using a non-zero amount of permits.
///
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
@@ -498,6 +516,12 @@ impl ConfigurableSemaphore {
}
}
impl Default for ConfigurableSemaphore {
fn default() -> Self {
Self::new(Self::DEFAULT_INITIAL)
}
}
impl PartialEq for ConfigurableSemaphore {
fn eq(&self, other: &Self) -> bool {
// the number of permits can be increased at runtime, so we cannot really fulfill the
@@ -534,6 +558,16 @@ mod tests {
.expect("parse_and_validate");
}
#[test]
fn test_compactl0_phase1_access_mode_is_ignored_silently() {
let input = indoc::indoc! {r#"
[compact_level0_phase1_value_access]
mode = "streaming-kmerge"
validate = "key-lsn-value"
"#};
toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
}
/// If there's a typo in the pageserver config, we'd rather catch that typo
/// and fail pageserver startup than silently ignoring the typo, leaving whoever
/// made it in the believe that their config change is effective.

View File

@@ -178,7 +178,7 @@ async fn collect_metrics(
)
.await;
if let Err(e) = res {
tracing::error!("failed to upload to remote storage: {e:#}");
tracing::error!("failed to upload to S3: {e:#}");
}
}
};

View File

@@ -2955,7 +2955,7 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.put(
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|r| api_handler(r, timeline_archival_config_handler),
)

View File

@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
}
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
global_latency_histo: &'a Histogram,
global_metric: &'a Histogram,
// Optional because not all op types are tracked per-timeline
per_timeline_latency_histo: Option<&'a Histogram>,
timeline_metric: Option<&'a Histogram>,
ctx: &'c RequestContext,
start: std::time::Instant,
@@ -1212,10 +1212,9 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
elapsed
}
};
self.global_latency_histo
.observe(ex_throttled.as_secs_f64());
if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
self.global_metric.observe(ex_throttled.as_secs_f64());
if let Some(timeline_metric) = self.timeline_metric {
timeline_metric.observe(ex_throttled.as_secs_f64());
}
}
}
@@ -1241,32 +1240,10 @@ pub enum SmgrQueryType {
#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage_started: IntCounter,
per_timeline_getpage_latency: Histogram,
global_metrics: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage: Histogram,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
"pageserver_smgr_query_started_global_count",
"Number of smgr queries started, aggregated by query type.",
&["smgr_query_type"],
)
.expect("failed to define a metric")
});
static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
"pageserver_smgr_query_started_count",
"Number of smgr queries started, aggregated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
@@ -1342,20 +1319,14 @@ impl SmgrQueryTimePerTimeline {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let global_started = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_STARTED_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let global_latency = std::array::from_fn(|i| {
let global_metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_TIME_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
@@ -1363,20 +1334,9 @@ impl SmgrQueryTimePerTimeline {
&timeline_id,
])
.unwrap();
let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
&shard_slug,
&timeline_id,
])
.unwrap();
Self {
global_started,
global_latency,
per_timeline_getpage_latency,
per_timeline_getpage_started,
global_metrics,
per_timeline_getpage,
}
}
pub(crate) fn start_timer<'c: 'a, 'a>(
@@ -1384,11 +1344,8 @@ impl SmgrQueryTimePerTimeline {
op: SmgrQueryType,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
let global_metric = &self.global_metrics[op as usize];
let start = Instant::now();
self.global_started[op as usize].inc();
// We subtract time spent throttled from the observed latency.
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
Err(error) => {
@@ -1407,16 +1364,15 @@ impl SmgrQueryTimePerTimeline {
}
}
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
self.per_timeline_getpage_started.inc();
Some(&self.per_timeline_getpage_latency)
let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
Some(&self.per_timeline_getpage)
} else {
None
};
Some(GlobalAndPerTimelineHistogramTimer {
global_latency_histo: &self.global_latency[op as usize],
per_timeline_latency_histo,
global_metric,
timeline_metric,
ctx,
start,
op,
@@ -1467,12 +1423,9 @@ mod smgr_query_time_tests {
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
.sum();
(
global,
metrics.per_timeline_getpage_latency.get_sample_count(),
)
(global, metrics.per_timeline_getpage.get_sample_count())
};
let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1824,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
.expect("failed to define a metric"),
upload_heatmap_duration: register_histogram!(
"pageserver_secondary_upload_heatmap_duration",
"Time to build and upload a heatmap, including any waiting inside the remote storage client"
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
@@ -2623,12 +2576,6 @@ impl TimelineMetrics {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
shard_id,
timeline_id,
]);
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
@@ -2645,8 +2592,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
tenant_throttling::remove_tenant_metrics(tenant_shard_id);
// we leave the BROKEN_TENANTS_SET entry if any
}
@@ -3110,180 +3055,41 @@ pub mod tokio_epoll_uring {
pub(crate) mod tenant_throttling {
use metrics::{register_int_counter_vec, IntCounter};
use once_cell::sync::Lazy;
use utils::shard::TenantShardId;
use crate::tenant::{self, throttle::Metric};
struct GlobalAndPerTenantIntCounter {
global: IntCounter,
per_tenant: IntCounter,
}
impl GlobalAndPerTenantIntCounter {
#[inline(always)]
pub(crate) fn inc(&self) {
self.inc_by(1)
}
#[inline(always)]
pub(crate) fn inc_by(&self, n: u64) {
self.global.inc_by(n);
self.per_tenant.inc_by(n);
}
}
pub(crate) struct TimelineGet {
count_accounted_start: GlobalAndPerTenantIntCounter,
count_accounted_finish: GlobalAndPerTenantIntCounter,
wait_time: GlobalAndPerTenantIntCounter,
count_throttled: GlobalAndPerTenantIntCounter,
wait_time: IntCounter,
count: IntCounter,
}
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start_global",
"Count of tenant throttling starts, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start",
"Count of tenant throttling starts, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish_global",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
&["kind"]
)
.unwrap()
});
static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
.unwrap()
});
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
let kind = "timeline_get";
TimelineGet {
wait_time: WAIT_USECS.with_label_values(&[kind]),
count: WAIT_COUNT.with_label_values(&[kind]),
}
});
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count",
"Count of tenant throttlings, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
const KIND: &str = "timeline_get";
impl TimelineGet {
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
TimelineGet {
count_accounted_start: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
count_accounted_finish: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
wait_time: {
GlobalAndPerTenantIntCounter {
global: WAIT_USECS.with_label_values(&[KIND]),
per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
count_throttled: {
GlobalAndPerTenantIntCounter {
global: WAIT_COUNT.with_label_values(&[KIND]),
per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]),
}
},
}
}
}
pub(crate) fn preinitialize_global_metrics() {
Lazy::force(&COUNT_ACCOUNTED_START);
Lazy::force(&COUNT_ACCOUNTED_FINISH);
Lazy::force(&WAIT_USECS);
Lazy::force(&WAIT_COUNT);
}
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
for m in &[
&COUNT_ACCOUNTED_START_PER_TENANT,
&COUNT_ACCOUNTED_FINISH_PER_TENANT,
&WAIT_USECS_PER_TENANT,
&WAIT_COUNT_PER_TENANT,
] {
let _ = m.remove_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]);
}
}
impl Metric for TimelineGet {
#[inline(always)]
fn accounting_start(&self) {
self.count_accounted_start.inc();
}
#[inline(always)]
fn accounting_finish(&self) {
self.count_accounted_finish.inc();
}
impl Metric for &'static TimelineGet {
#[inline(always)]
fn observe_throttling(
&self,
@@ -3291,7 +3097,7 @@ pub(crate) mod tenant_throttling {
) {
let val = u64::try_from(wait_time.as_micros()).unwrap();
self.wait_time.inc_by(val);
self.count_throttled.inc();
self.count.inc();
}
}
}
@@ -3421,14 +3227,11 @@ pub fn preinitialize_metrics() {
}
// countervecs
[
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
&SMGR_QUERY_STARTED_GLOBAL,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
@@ -3450,8 +3253,7 @@ pub fn preinitialize_metrics() {
// Custom
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
Lazy::force(&BASEBACKUP_QUERY_TIME);
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
tenant_throttling::preinitialize_global_metrics();
}

View File

@@ -18,6 +18,7 @@ use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
@@ -33,7 +34,6 @@ use remote_storage::GenericRemoteStorage;
use remote_storage::TimeoutOrCancel;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
@@ -140,7 +140,6 @@ pub mod metadata;
pub mod remote_timeline_client;
pub mod storage_layer;
pub mod checks;
pub mod config;
pub mod mgr;
pub mod secondary;
@@ -302,7 +301,7 @@ pub struct Tenant {
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach concurrency limiter.
///
@@ -1031,9 +1030,13 @@ impl Tenant {
}
Ok(TenantPreload {
timelines: self
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.await?,
timelines: Self::load_timeline_metadata(
self,
remote_timeline_ids,
remote_storage,
cancel,
)
.await?,
})
}
@@ -1299,7 +1302,7 @@ impl Tenant {
.await
}
async fn load_timelines_metadata(
async fn load_timeline_metadata(
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
@@ -1307,10 +1310,33 @@ impl Tenant {
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
let cancel_clone = cancel.clone();
part_downloads.spawn(
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
async move {
debug!("starting index part download");
let index_part = client.download_index_file(&cancel_clone).await;
debug!("finished index part download");
Result::<_, anyhow::Error>::Ok(TimelinePreload {
client,
timeline_id,
index_part,
})
}
.map(move |res| {
res.with_context(|| format!("download index part for timeline {timeline_id}"))
})
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
@@ -1321,7 +1347,8 @@ impl Tenant {
next = part_downloads.join_next() => {
match next {
Some(result) => {
let preload = result.context("join preload task")?;
let preload_result = result.context("join preload task")?;
let preload = preload_result?;
timeline_preloads.insert(preload.timeline_id, preload);
},
None => {
@@ -1338,36 +1365,6 @@ impl Tenant {
Ok(timeline_preloads)
}
fn load_timeline_metadata(
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("starting index part download");
let index_part = client.download_index_file(&cancel).await;
debug!("finished index part download");
TimelinePreload {
client,
timeline_id,
index_part,
}
}
}
pub(crate) async fn apply_timeline_archival_config(
&self,
timeline_id: TimelineId,
@@ -1576,9 +1573,6 @@ impl Tenant {
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
use checks::check_valid_layermap;
use itertools::Itertools;
let tline = self
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
@@ -1593,18 +1587,6 @@ impl Tenant {
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
.await
.layer_map()
.unwrap()
.iter_historic_layers()
.map(|layer| layer.layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("invalid layermap: {err}");
}
Ok(tline)
}
@@ -2833,7 +2815,7 @@ impl Tenant {
gate: Gate::default(),
timeline_get_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
@@ -3215,9 +3197,6 @@ impl Tenant {
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
use checks::check_valid_layermap;
use itertools::Itertools;
let tline = self
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
.await?;
@@ -3238,18 +3217,6 @@ impl Tenant {
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
.await
.layer_map()
.unwrap()
.iter_historic_layers()
.map(|layer| layer.layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("invalid layermap: {err}");
}
Ok(tline)
}
@@ -4197,18 +4164,9 @@ pub(crate) mod harness {
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
if records_neon {
// For Neon wal records, we can decode without spawning postgres, so do so.
let mut page = match (base_img, records.first()) {
(Some((_lsn, img)), _) => {
let mut page = BytesMut::new();
page.extend_from_slice(&img);
page
}
(_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
_ => {
panic!("Neon WAL redo requires base image or will init record");
}
};
let base_img = base_img.expect("Neon WAL redo requires base image").1;
let mut page = BytesMut::new();
page.extend_from_slice(&base_img);
for (record_lsn, record) in records {
apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
}
@@ -8512,135 +8470,4 @@ mod tests {
Ok(())
}
// Regression test for https://github.com/neondatabase/neon/issues/9012
// Create an image arrangement where we have to read at different LSN ranges
// from a delta layer. This is achieved by overlapping an image layer on top of
// a delta layer. Like so:
//
// A B
// +----------------+ -> delta_layer
// | | ^ lsn
// | =========|-> nested_image_layer |
// | C | |
// +----------------+ |
// ======== -> baseline_image_layer +-------> key
//
//
// When querying the key range [A, B) we need to read at different LSN ranges
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
#[tokio::test]
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
let (tenant, ctx) = harness.load().await;
let will_init_keys = [2, 6];
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let mut expected_key_values = HashMap::new();
let baseline_image_layer_lsn = Lsn(0x10);
let mut baseline_img_layer = Vec::new();
for i in 0..5 {
let key = get_key(i);
let value = format!("value {i}@{baseline_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
baseline_img_layer.push((key, Bytes::from(value)));
}
let nested_image_layer_lsn = Lsn(0x50);
let mut nested_img_layer = Vec::new();
for i in 5..10 {
let key = get_key(i);
let value = format!("value {i}@{nested_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
nested_img_layer.push((key, Bytes::from(value)));
}
let mut delta_layer_spec = Vec::default();
let delta_layer_start_lsn = Lsn(0x20);
let mut delta_layer_end_lsn = delta_layer_start_lsn;
for i in 0..10 {
let key = get_key(i);
let key_in_nested = nested_img_layer
.iter()
.any(|(key_with_img, _)| *key_with_img == key);
let lsn = {
if key_in_nested {
Lsn(nested_image_layer_lsn.0 + 0x10)
} else {
delta_layer_start_lsn
}
};
let will_init = will_init_keys.contains(&i);
if will_init {
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
expected_key_values.insert(key, "".to_string());
} else {
let delta = format!("@{lsn}");
delta_layer_spec.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
}
delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
}
delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
assert!(
nested_image_layer_lsn > delta_layer_start_lsn
&& nested_image_layer_lsn < delta_layer_end_lsn
);
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
delta_layer_start_lsn..delta_layer_end_lsn,
delta_layer_spec,
)], // delta layers
vec![
(baseline_image_layer_lsn, baseline_img_layer),
(nested_image_layer_lsn, nested_img_layer),
], // image layers
delta_layer_end_lsn,
)
.await?;
let keyspace = KeySpace::single(get_key(0)..get_key(10));
let results = tline
.get_vectored(keyspace, delta_layer_end_lsn, &ctx)
.await
.expect("No vectored errors");
for (key, res) in results {
let value = res.expect("No key errors");
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
assert_eq!(value, Bytes::from(expected_value));
}
Ok(())
}
}

View File

@@ -1,55 +0,0 @@
use std::collections::BTreeSet;
use itertools::Itertools;
use super::storage_layer::LayerName;
/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
///
/// ```plain
/// | | | |
/// | 1 | | 2 | | 3 |
/// | | | | | |
/// ```
///
/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
/// the same LSN range.
///
/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
///
/// ```plain
/// | | | 2 | | |
/// | 1 | |-------| | 3 |
/// | | | 4 | | |
///
/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
let mut all_delta_layers = Vec::new();
for name in metadata {
if let LayerName::Delta(layer) = name {
if layer.key_range.start.next() != layer.key_range.end {
all_delta_layers.push(layer.clone());
}
}
}
for layer in &all_delta_layers {
let lsn_range = &layer.lsn_range;
lsn_split_point.insert(lsn_range.start);
lsn_split_point.insert(lsn_range.end);
}
for layer in &all_delta_layers {
let lsn_range = layer.lsn_range.clone();
let intersects = lsn_split_point.range(lsn_range).collect_vec();
if intersects.len() > 1 {
let err = format!(
"layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
layer,
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
);
return Some(err);
}
}
None
}

View File

@@ -1,29 +1,11 @@
use std::{collections::HashMap, time::Duration};
use std::collections::HashMap;
use super::remote_timeline_client::index::GcBlockingReason;
use tokio::time::Instant;
use utils::id::TimelineId;
type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
use super::remote_timeline_client::index::GcBlockingReason;
#[derive(Default)]
struct Storage {
timelines_blocked: TimelinesBlocked,
/// The deadline before which we are blocked from GC so that
/// leases have a chance to be renewed.
lsn_lease_deadline: Option<Instant>,
}
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
impl Storage {
fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
self.lsn_lease_deadline
.map(|d| Instant::now() < d)
.unwrap_or(false)
}
}
/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
/// blocking.
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
@@ -31,12 +13,6 @@ pub(crate) struct GcBlock {
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
/// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
reasons: std::sync::Mutex<Storage>,
/// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
///
/// Do not add any more features taking and forbidding taking this lock. It should be
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
/// synchronizes with gc attempts by locking and unlocking this mutex.
blocking: tokio::sync::Mutex<()>,
}
@@ -66,20 +42,6 @@ impl GcBlock {
}
}
/// Sets a deadline before which we cannot proceed to GC due to lsn lease.
///
/// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
/// length, we guarantee that all the leases we granted before will have a chance to renew
/// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
let deadline = Instant::now() + lsn_lease_length;
let mut g = self.reasons.lock().unwrap();
g.lsn_lease_deadline = Some(deadline);
}
/// Describe the current gc blocking reasons.
///
/// TODO: make this json serializable.
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
let g = self.reasons.lock().unwrap();
@@ -102,7 +64,7 @@ impl GcBlock {
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -133,7 +95,7 @@ impl GcBlock {
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.timelines_blocked.entry(timeline.timeline_id) {
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
@@ -147,7 +109,7 @@ impl GcBlock {
}
}
let remaining_blocks = g.timelines_blocked.len();
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
@@ -172,11 +134,11 @@ impl GcBlock {
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.timelines_blocked.is_empty() {
if g.is_empty() {
return;
}
g.timelines_blocked.remove(&timeline.timeline_id);
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
@@ -187,11 +149,10 @@ impl GcBlock {
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.timelines_blocked.is_empty());
g.timelines_blocked
.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -205,7 +166,6 @@ pub(super) struct Guard<'a> {
#[derive(Debug)]
pub(crate) struct BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: bool,
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
@@ -214,8 +174,8 @@ impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
@@ -223,15 +183,13 @@ impl std::fmt::Display for BlockingReasons {
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.timelines_blocked.retain(|_key, value| {
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
if !g.is_empty() {
Some(BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
timelines: g.timelines_blocked.len(),
timelines: g.len(),
reasons,
})
} else {
@@ -240,17 +198,14 @@ impl BlockingReasons {
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
if g.is_empty() {
None
} else {
let reasons = g
.timelines_blocked
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
timelines: g.timelines_blocked.len(),
timelines: g.len(),
reasons,
})
}

View File

@@ -949,12 +949,6 @@ impl TenantManager {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
match attach_conf.generation.cmp(&tenant.generation) {
Ordering::Equal => {
if attach_conf.attach_mode == AttachmentMode::Single {
tenant
.gc_block
.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
}
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.

View File

@@ -276,16 +276,6 @@ pub(crate) enum LayerId {
InMemoryLayerId(InMemoryLayerFileId),
}
/// Uniquely identify a layer visit by the layer
/// and LSN floor (or start LSN) of the reads.
/// The layer itself is not enough since we may
/// have different LSN lower bounds for delta layer reads.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
struct LayerToVisitId {
layer_id: LayerId,
lsn_floor: Lsn,
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
@@ -297,9 +287,9 @@ pub(crate) enum ReadableLayer {
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct LayerVisit {
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_to_visit_id: LayerToVisitId,
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
}
@@ -313,12 +303,12 @@ struct LayerVisit {
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
planned_visits_by_lsn: BinaryHeap<LayerVisit>,
visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerVisitReads {
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpaceRandomAccum,
}
@@ -326,23 +316,23 @@ struct LayerVisitReads {
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
planned_visits_by_lsn: BinaryHeap::new(),
visit_reads: HashMap::new(),
planned_reads_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_visits_by_lsn.pop() {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
None => return None,
};
let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((
_,
LayerVisitReads {
LayerKeyspace {
layer,
mut target_keyspace,
},
@@ -361,24 +351,20 @@ impl LayerFringe {
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_to_visit_id = LayerToVisitId {
layer_id: layer.id(),
lsn_floor: lsn_range.start,
};
let entry = self.visit_reads.entry(layer_to_visit_id.clone());
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.add_keyspace(keyspace);
}
Entry::Vacant(entry) => {
self.planned_visits_by_lsn.push(LayerVisit {
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_to_visit_id: layer_to_visit_id.clone(),
layer_id: layer_id.clone(),
});
let mut accum = KeySpaceRandomAccum::new();
accum.add_keyspace(keyspace);
entry.insert(LayerVisitReads {
entry.insert(LayerKeyspace {
layer,
target_keyspace: accum,
});
@@ -393,7 +379,7 @@ impl Default for LayerFringe {
}
}
impl Ord for LayerVisit {
impl Ord for ReadDesc {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
if ord == std::cmp::Ordering::Equal {
@@ -404,19 +390,19 @@ impl Ord for LayerVisit {
}
}
impl PartialOrd for LayerVisit {
impl PartialOrd for ReadDesc {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for LayerVisit {
impl PartialEq for ReadDesc {
fn eq(&self, other: &Self) -> bool {
self.lsn_range == other.lsn_range
}
}
impl Eq for LayerVisit {}
impl Eq for ReadDesc {}
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {

View File

@@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::{self, VirtualFile};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -58,6 +58,7 @@ use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio_stream::StreamExt;
use tracing::*;
@@ -69,7 +70,9 @@ use utils::{
};
use super::layer_name::ImageLayerName;
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
use super::{
AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
///
/// Header stored in the beginning of the file
@@ -797,9 +800,10 @@ impl ImageLayerWriterInner {
///
async fn finish(
self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
) -> anyhow::Result<ResidentLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -875,9 +879,12 @@ impl ImageLayerWriterInner {
// fsync the file
file.sync_all().await?;
trace!("created image layer {}", self.path);
// FIXME: why not carry the virtualfile here, it supports renaming?
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
Ok((desc, self.path))
info!("created image layer {}", layer.local_path());
Ok(layer)
}
}
@@ -956,18 +963,24 @@ impl ImageLayerWriter {
///
pub(crate) async fn finish(
mut self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
self.inner.take().unwrap().finish(ctx, None).await
) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline, ctx, None).await
}
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
timeline: &Arc<Timeline>,
end_key: Key,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
self.inner.take().unwrap().finish(ctx, Some(end_key)).await
) -> anyhow::Result<super::ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(timeline, ctx, Some(end_key))
.await
}
}
@@ -1071,7 +1084,7 @@ mod test {
tenant::{
config::TenantConf,
harness::{TenantHarness, TIMELINE_ID},
storage_layer::{Layer, ResidentLayer},
storage_layer::ResidentLayer,
vectored_blob_io::StreamingVectoredReadPlanner,
Tenant, Timeline,
},
@@ -1142,8 +1155,7 @@ mod test {
key = key.next();
}
let (desc, path) = writer.finish(&ctx).await.unwrap();
Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
writer.finish(&timeline, &ctx).await.unwrap()
};
let original_size = resident.metadata().file_size;
@@ -1205,9 +1217,7 @@ mod test {
.await
.unwrap();
let replacement = if wrote_keys > 0 {
let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
Some(resident)
Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
} else {
None
};
@@ -1280,8 +1290,7 @@ mod test {
for (key, img) in images {
writer.put_image(key, img, ctx).await?;
}
let (desc, path) = writer.finish(ctx).await?;
let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
let img_layer = writer.finish(tline, ctx).await?;
Ok::<_, anyhow::Error>(img_layer)
}

View File

@@ -439,30 +439,11 @@ impl Layer {
fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible: maybe log about this
match ctx.task_kind() {
TaskKind::CalculateSyntheticSize
| TaskKind::GarbageCollector
| TaskKind::MgmtRequest => {
// This situation is expected in code paths do binary searches of the LSN space to resolve
// an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
// and on-demand for certain HTTP API requests.
}
_ => {
// In all other contexts, it is unusual to do I/O involving layers which are not visible at
// some branch tip, so we log the fact that we are accessing something that the visibility
// calculation thought should not be visible.
//
// This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
// which was covered by a concurrent compaction.
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
}
}
// Update the timeline's visible bytes count
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge

View File

@@ -1025,15 +1025,6 @@ fn access_stats() {
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
// Recording access implicitly makes layer visible, if it wasn't already
let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
access_stats.set_visibility(LayerVisibilityHint::Covered);
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
assert!(access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert!(!access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
}
#[test]

View File

@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
self.generated_layers
.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers
.push(SplitWriterResult::Produced(layer));
self.generated_layers.push(SplitWriterResult::Produced(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
));
}
}
self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
if discard(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(SplitWriterResult::Produced(layer));
generated_layers.push(SplitWriterResult::Produced(
inner.finish_with_end_key(tline, end_key, ctx).await?,
));
}
Ok(generated_layers)
}

View File

@@ -163,6 +163,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// How many errors we have seen consequtively
let mut error_run_count = 0;
let mut last_throttle_flag_reset_at = Instant::now();
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -189,6 +191,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
let sleep_duration;
if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
@@ -203,18 +207,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
};
// Run compaction
let IterationResult { output, elapsed } = iteration
.run(tenant.compaction_iteration(&cancel, &ctx))
.await;
let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
match output {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if has_pending_task {
Duration::ZERO
} else {
period
};
sleep_duration = if has_pending_task { Duration::ZERO } else { period };
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -235,20 +233,38 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(
elapsed_ms = elapsed.as_millis(),
"compaction iteration complete"
);
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
};
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
if let Some(walredo_mgr) = &tenant.walredo_mgr {
walredo_mgr.maybe_quiesce(period * 10);
}
// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted,
count_throttled,
sum_throttled_usecs,
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
@@ -330,7 +346,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
loop {
tokio::select! {
_ = cancel.cancelled() => {
@@ -348,6 +363,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
first = false;
let delays = async {
delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
random_init_delay(period, &cancel).await?;
Ok::<_, Cancelled>(())
};
@@ -421,7 +437,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
tokio::select! {
_ = cancel.cancelled() => {
@@ -468,29 +483,6 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
kind: BackgroundLoopKind::IngestHouseKeeping,
};
iteration.run(tenant.ingest_housekeeping()).await;
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
// Or just spawn another background loop for this throttle, it's not like it's super costly.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,
count_accounted_start, // log after pre-existing fields to not break existing log scraping
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
}
}
.await;
@@ -546,12 +538,28 @@ pub(crate) async fn random_init_delay(
let mut rng = rand::thread_rng();
rng.gen_range(Duration::ZERO..=period)
};
match tokio::time::timeout(d, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
}
}
/// Delays GC by defaul lease length at restart.
///
/// We do this as the leases mapping are not persisted to disk. By delaying GC by default
/// length, we gurantees that all the leases we granted before the restart will expire
/// when we run GC for the first time after the restart.
pub(crate) async fn delay_by_lease_length(
length: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
match tokio::time::timeout(length, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
}
}
struct Iteration {
started_at: Instant,
period: Duration,

View File

@@ -24,10 +24,8 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
pub struct Throttle<M: Metric> {
inner: ArcSwap<Inner>,
metric: M,
/// will be turned into [`Stats::count_accounted_start`]
count_accounted_start: AtomicU64,
/// will be turned into [`Stats::count_accounted_finish`]
count_accounted_finish: AtomicU64,
/// will be turned into [`Stats::count_accounted`]
count_accounted: AtomicU64,
/// will be turned into [`Stats::count_throttled`]
count_throttled: AtomicU64,
/// will be turned into [`Stats::sum_throttled_usecs`]
@@ -45,21 +43,17 @@ pub struct Observation {
pub wait_time: Duration,
}
pub trait Metric {
fn accounting_start(&self);
fn accounting_finish(&self);
fn observe_throttling(&self, observation: &Observation);
}
/// See [`Throttle::reset_stats`].
pub struct Stats {
/// Number of requests that started [`Throttle::throttle`] calls.
pub count_accounted_start: u64,
/// Number of requests that finished [`Throttle::throttle`] calls.
pub count_accounted_finish: u64,
/// Subset of the `accounted` requests that were actually throttled.
/// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
pub count_accounted: u64,
// Subset of the `accounted` requests that were actually throttled.
// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
pub count_throttled: u64,
/// Sum of microseconds that throttled requests spent waiting for throttling.
// Sum of microseconds that throttled requests spent waiting for throttling.
pub sum_throttled_usecs: u64,
}
@@ -71,8 +65,7 @@ where
Self {
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
metric,
count_accounted_start: AtomicU64::new(0),
count_accounted_finish: AtomicU64::new(0),
count_accounted: AtomicU64::new(0),
count_throttled: AtomicU64::new(0),
sum_throttled_usecs: AtomicU64::new(0),
}
@@ -124,13 +117,11 @@ where
/// This method allows retrieving & resetting that flag.
/// Useful for periodic reporting.
pub fn reset_stats(&self) -> Stats {
let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
Stats {
count_accounted_start,
count_accounted_finish,
count_accounted,
count_throttled,
sum_throttled_usecs,
}
@@ -148,12 +139,9 @@ where
};
let start = std::time::Instant::now();
self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
self.metric.accounting_finish();
self.count_accounted.fetch_add(1, Ordering::Relaxed);
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
let now = Instant::now();

View File

@@ -196,8 +196,9 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub timeline_get_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
@@ -405,8 +406,9 @@ pub struct Timeline {
gc_lock: tokio::sync::Mutex<()>,
/// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
timeline_get_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
/// Keep aux directory cache to avoid it's reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
@@ -4011,9 +4013,7 @@ impl Timeline {
if wrote_keys {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("created image layer for rel {}", image_layer.local_path());
let image_layer = image_layer_writer.finish(self, ctx).await?;
Ok(ImageLayerCreationOutcome {
image: Some(image_layer),
next_start_key: img_range.end,
@@ -4101,12 +4101,7 @@ impl Timeline {
if wrote_any_image {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!(
"created image layer for metadata {}",
image_layer.local_path()
);
let image_layer = image_layer_writer.finish(self, ctx).await?;
Ok(ImageLayerCreationOutcome {
image: Some(image_layer),
next_start_key: img_range.end,
@@ -4314,9 +4309,7 @@ impl Timeline {
timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
if !image_layers.is_empty() {
self.update_layer_visibility().await?;
}
self.update_layer_visibility().await?;
Ok(image_layers)
}
@@ -5378,8 +5371,7 @@ impl Timeline {
/// Force create an image layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
/// placed into the layer map in one run AND be validated.
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_image_layer(
self: &Arc<Timeline>,
@@ -5411,9 +5403,8 @@ impl Timeline {
for (key, img) in images {
image_layer_writer.put_image(key, img, ctx).await?;
}
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("force created image layer {}", image_layer.local_path());
let image_layer = image_layer_writer.finish(self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5425,8 +5416,7 @@ impl Timeline {
/// Force create a delta layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
/// placed into the layer map in one run AND be validated.
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_delta_layer(
self: &Arc<Timeline>,
@@ -5452,6 +5442,33 @@ impl Timeline {
if let Some(check_start_lsn) = check_start_lsn {
assert!(deltas.lsn_range.start >= check_start_lsn);
}
// check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
// layers of the same start/end LSN, and so should the force inserted layer
{
/// Checks if a overlaps with b, assume a/b = [start, end).
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
if deltas.key_range.start.next() != deltas.key_range.end {
let guard = self.layers.read().await;
let mut invalid_layers =
guard.layer_map()?.iter_historic_layers().filter(|layer| {
layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
// skip single-key layer files
&& layer.key_range.start.next() != layer.key_range.end
});
if let Some(layer) = invalid_layers.next() {
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
panic!(
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
);
}
}
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
@@ -5466,7 +5483,7 @@ impl Timeline {
}
let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("force created delta layer {}", delta_layer.local_path());
{
let mut guard = self.layers.write().await;
guard.open_mut().unwrap().force_insert_layer(delta_layer);

View File

@@ -29,7 +29,6 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
@@ -564,12 +563,10 @@ impl Timeline {
.await?;
if keys_written > 0 {
let (desc, path) = image_layer_writer
.finish(ctx)
let new_layer = image_layer_writer
.finish(self, ctx)
.await
.map_err(CompactionError::Other)?;
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
layer.metadata().file_size,
new_layer.metadata().file_size);
@@ -1789,12 +1786,20 @@ impl Timeline {
stat.visit_image_layer(desc.file_size());
}
}
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("cannot run gc-compaction because {}", err);
for layer in &layer_selection {
let desc = layer.layer_desc();
let key_range = &desc.key_range;
if desc.is_delta() && key_range.start.next() != key_range.end {
let lsn_range = desc.lsn_range.clone();
let intersects = lsn_split_point.range(lsn_range).collect_vec();
if intersects.len() > 1 {
bail!(
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
desc.key(),
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
);
}
}
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = layer_selection

View File

@@ -135,6 +135,25 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
.context("delete_all")
}
// This function removs remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
// delete mark should be present because it is the last step during deletion.
// (nothing can fail after its deletion)
async fn cleanup_remaining_timeline_fs_traces(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
// Remove delete mark
// TODO: once we are confident that no more exist in the field, remove this
// line. It cleans up a legacy marker file that might in rare cases be present.
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("remove delete mark")
}
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_timeline_from_tenant(
@@ -175,10 +194,12 @@ async fn remove_timeline_from_tenant(
/// 7. Delete mark file
///
/// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process:
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly neeed to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
/// index but still have local metadata, timeline directory and delete mark.
///
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
#[derive(Default)]
@@ -290,6 +311,18 @@ impl DeleteTimelineFlow {
Ok(())
}
#[instrument(skip_all, fields(%timeline_id))]
pub async fn cleanup_remaining_timeline_fs_traces(
tenant: &Tenant,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
let r =
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
.await;
info!("Done");
r
}
fn prepare(
tenant: &Tenant,
timeline_id: TimelineId,

View File

@@ -35,7 +35,6 @@ use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
use pageserver_api::shard::TenantShardId;
use std::future::Future;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
@@ -205,22 +204,6 @@ impl PostgresRedoManager {
}
}
/// Do a ping request-response roundtrip.
///
/// Not used in production, but by Rust benchmarks.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
self.do_with_walredo_process(pg_version, |proc| async move {
proc.ping(Duration::from_secs(1))
.await
.map_err(Error::Other)
})
.await
}
pub fn status(&self) -> WalRedoManagerStatus {
WalRedoManagerStatus {
last_redo_at: {
@@ -313,100 +296,6 @@ impl PostgresRedoManager {
}
}
/// # Cancel-Safety
///
/// This method is cancel-safe iff `closure` is cancel-safe.
async fn do_with_walredo_process<
F: FnOnce(Arc<Process>) -> Fut,
Fut: Future<Output = Result<O, Error>>,
O,
>(
&self,
pg_version: u32,
closure: F,
) -> Result<O, Error> {
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
Ok(guard) => match &*guard {
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
ProcessOnceCell::ManagerShutDown => {
return Err(Error::Cancelled);
}
},
Err(permit) => {
let start = Instant::now();
// acquire guard before spawning process, so that we don't spawn new processes
// if the gate is already closed.
let _launched_processes_guard = match self.launched_processes.enter() {
Ok(guard) => guard,
Err(GateError::GateClosed) => unreachable!(
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
),
};
let proc = Arc::new(Process {
process: process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
_launched_processes_guard,
});
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
elapsed_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
proc
}
};
// async closures are unstable, would support &Process
let result = closure(proc.clone()).await;
if result.is_err() {
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
// Note that there may be other tasks concurrent with us that also hold `proc`.
// We have to deal with that here.
// Also read the doc comment on field `self.redo_process`.
//
// NB: there may still be other concurrent threads using `proc`.
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
//
// NB: the drop impl blocks the dropping thread with a wait() system call for
// the child process. In some ways the blocking is actually good: if we
// deferred the waiting into the background / to tokio if we used `tokio::process`,
// it could happen that if walredo always fails immediately, we spawn processes faster
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
match self.redo_process.get() {
None => (),
Some(guard) => {
match &*guard {
ProcessOnceCell::ManagerShutDown => {}
ProcessOnceCell::Spawned(guard_proc) => {
if Arc::ptr_eq(&proc, guard_proc) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
guard.take_and_deinit();
} else {
// Another task already spawned another redo process (further up in this method)
// and put it into `redo_process`. Do nothing, our view of the world is behind.
}
}
}
}
}
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
drop(proc);
}
result
}
///
/// Process one request for WAL redo using wal-redo postgres
///
@@ -430,63 +319,130 @@ impl PostgresRedoManager {
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {
let base_img = &base_img;
let closure = |proc: Arc<Process>| async move {
let started_at = std::time::Instant::now();
// Relational WAL records are applied using wal-redo-postgres
let result = proc
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
.context("apply_wal_records");
let duration = started_at.elapsed();
let len = records.len();
let nbytes = records.iter().fold(0, |acumulator, record| {
acumulator
+ match &record.1 {
NeonWalRecord::Postgres { rec, .. } => rec.len(),
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
}
});
WAL_REDO_TIME.observe(duration.as_secs_f64());
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
debug!(
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
if let Err(e) = result.as_ref() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn,
n_attempts,
e,
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
Ok(guard) => match &*guard {
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
ProcessOnceCell::ManagerShutDown => {
return Err(Error::Cancelled);
}
},
Err(permit) => {
let start = Instant::now();
// acquire guard before spawning process, so that we don't spawn new processes
// if the gate is already closed.
let _launched_processes_guard = match self.launched_processes.enter() {
Ok(guard) => guard,
Err(GateError::GateClosed) => unreachable!(
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
),
};
let proc = Arc::new(Process {
process: process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
_launched_processes_guard,
});
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
proc
}
result.map_err(Error::Other)
};
let result = self.do_with_walredo_process(pg_version, closure).await;
if result.is_ok() && n_attempts != 0 {
let started_at = std::time::Instant::now();
// Relational WAL records are applied using wal-redo-postgres
let result = proc
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
.await
.context("apply_wal_records");
let duration = started_at.elapsed();
let len = records.len();
let nbytes = records.iter().fold(0, |acumulator, record| {
acumulator
+ match &record.1 {
NeonWalRecord::Postgres { rec, .. } => rec.len(),
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
}
});
WAL_REDO_TIME.observe(duration.as_secs_f64());
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
debug!(
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
// If something went wrong, don't try to reuse the process. Kill it, and
// next request will launch a new one.
if let Err(e) = result.as_ref() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn,
n_attempts,
e,
);
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
// Note that there may be other tasks concurrent with us that also hold `proc`.
// We have to deal with that here.
// Also read the doc comment on field `self.redo_process`.
//
// NB: there may still be other concurrent threads using `proc`.
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
//
// NB: the drop impl blocks the dropping thread with a wait() system call for
// the child process. In some ways the blocking is actually good: if we
// deferred the waiting into the background / to tokio if we used `tokio::process`,
// it could happen that if walredo always fails immediately, we spawn processes faster
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
match self.redo_process.get() {
None => (),
Some(guard) => {
match &*guard {
ProcessOnceCell::ManagerShutDown => {}
ProcessOnceCell::Spawned(guard_proc) => {
if Arc::ptr_eq(&proc, guard_proc) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
guard.take_and_deinit();
} else {
// Another task already spawned another redo process (further up in this method)
// and put it into `redo_process`. Do nothing, our view of the world is behind.
}
}
}
}
}
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
drop(proc);
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");
}
n_attempts += 1;
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
return result;
return result.map_err(Error::Other);
}
}
}
@@ -556,17 +512,6 @@ mod tests {
use tracing::Instrument;
use utils::{id::TenantId, lsn::Lsn};
#[tokio::test]
async fn test_ping() {
let h = RedoHarness::new().unwrap();
h.manager
.ping(14)
.instrument(h.span())
.await
.expect("ping should work");
}
#[tokio::test]
async fn short_v14_redo() {
let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();

View File

@@ -6,7 +6,6 @@ use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
page_cache::PAGE_SZ,
span::debug_assert_current_span_has_tenant_id,
walrecord::NeonWalRecord,
};
@@ -238,26 +237,6 @@ impl WalRedoProcess {
res
}
/// Do a ping request-response roundtrip.
///
/// Not used in production, but by Rust benchmarks.
pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
let mut writebuf: Vec<u8> = Vec::with_capacity(4);
protocol::build_ping_msg(&mut writebuf);
let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
else {
anyhow::bail!("WAL redo ping timed out");
};
let response = res?;
if response.len() != PAGE_SZ {
anyhow::bail!(
"WAL redo ping response should respond with page-sized response: {}",
response.len()
);
}
Ok(())
}
/// # Cancel-Safety
///
/// When not polled to completion (e.g. because in `tokio::select!` another

View File

@@ -55,8 +55,3 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
tag.ser_into(buf)
.expect("serialize BufferTag should always succeed");
}
pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
buf.put_u8(b'H');
buf.put_u32(4);
}

View File

@@ -24,7 +24,6 @@
* PushPage ('P'): Copy a page image (in the payload) to buffer cache
* ApplyRecord ('A'): Apply a WAL record (in the payload)
* GetPage ('G'): Return a page image from buffer cache.
* Ping ('H'): Return the input message.
*
* Currently, you only get a response to GetPage requests; the response is
* simply a 8k page, without any headers. Errors are logged to stderr.
@@ -134,7 +133,6 @@ static void ApplyRecord(StringInfo input_message);
static void apply_error_callback(void *arg);
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
static void GetPage(StringInfo input_message);
static void Ping(StringInfo input_message);
static ssize_t buffered_read(void *buf, size_t count);
static void CreateFakeSharedMemoryAndSemaphores();
@@ -396,10 +394,6 @@ WalRedoMain(int argc, char *argv[])
GetPage(&input_message);
break;
case 'H': /* Ping */
Ping(&input_message);
break;
/*
* EOF means we're done. Perform normal shutdown.
*/
@@ -1063,36 +1057,6 @@ GetPage(StringInfo input_message)
}
static void
Ping(StringInfo input_message)
{
int tot_written;
/* Response: the input message */
tot_written = 0;
do {
ssize_t rc;
/* We don't need alignment, but it's bad practice to use char[BLCKSZ] */
#if PG_VERSION_NUM >= 160000
static const PGIOAlignedBlock response;
#else
static const PGAlignedBlock response;
#endif
rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written);
if (rc < 0) {
/* If interrupted by signal, just retry */
if (errno == EINTR)
continue;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to stdout: %m")));
}
tot_written += rc;
} while (tot_written < BLCKSZ);
elog(TRACE, "Page sent back for ping");
}
/* Buffer used by buffered_read() */
static char stdin_buf[16 * 1024];
static size_t stdin_len = 0; /* # of bytes in buffer */

View File

@@ -18,6 +18,7 @@ atomic-take.workspace = true
aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
aws-types.workspace = true
base64.workspace = true
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
@@ -25,6 +26,7 @@ camino.workspace = true
chrono.workspace = true
clap.workspace = true
consumption_metrics.workspace = true
crossbeam-deque.workspace = true
dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
@@ -46,9 +48,11 @@ indexmap.workspace = true
ipnet.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }
md5.workspace = true
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
@@ -63,6 +67,7 @@ reqwest.workspace = true
reqwest-middleware = { workspace = true, features = ["json"] }
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
routerify.workspace = true
rustc-hash.workspace = true
rustls-pemfile.workspace = true
rustls.workspace = true
@@ -74,6 +79,7 @@ smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
subtle.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
@@ -82,6 +88,7 @@ tokio-postgres-rustls.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tower-service.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true

View File

@@ -163,7 +163,6 @@ impl ComputeUserInfo {
}
pub(crate) enum ComputeCredentialKeys {
#[cfg(any(test, feature = "testing"))]
Password(Vec<u8>),
AuthKeys(AuthKeys),
None,
@@ -294,10 +293,16 @@ async fn auth_quirks(
// We now expect to see a very specific payload in the place of password.
let (info, unauthenticated_password) = match user_info.try_into() {
Err(info) => {
let (info, password) =
hacks::password_hack_no_authentication(ctx, info, client).await?;
ctx.set_endpoint_id(info.endpoint.clone());
(info, Some(password))
let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
ctx.set_endpoint_id(res.info.endpoint.clone());
let password = match res.keys {
ComputeCredentialKeys::Password(p) => p,
ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
unreachable!("password hack should return a password")
}
};
(res.info, Some(password))
}
Ok(info) => (info, None),
};

View File

@@ -1,4 +1,6 @@
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
use super::{
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
};
use crate::{
auth::{self, AuthFlow},
config::AuthenticationConfig,
@@ -61,7 +63,7 @@ pub(crate) async fn password_hack_no_authentication(
ctx: &RequestMonitoring,
info: ComputeUserInfoNoEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
) -> auth::Result<ComputeCredentials> {
warn!("project not specified, resorting to the password hack auth flow");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -77,12 +79,12 @@ pub(crate) async fn password_hack_no_authentication(
info!(project = &*payload.endpoint, "received missing parameter");
// Report tentative success; compute node will check the password anyway.
Ok((
ComputeUserInfo {
Ok(ComputeCredentials {
info: ComputeUserInfo {
user: info.user,
options: info.options,
endpoint: payload.endpoint,
},
payload.password,
))
keys: ComputeCredentialKeys::Password(payload.password),
})
}

View File

@@ -25,8 +25,6 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
role_name: RoleName,
) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
}
@@ -103,9 +101,7 @@ impl JwkCacheEntryLock {
async fn renew_jwks<F: FetchAuthRules>(
&self,
_permit: JwkRenewalPermit<'_>,
ctx: &RequestMonitoring,
client: &reqwest::Client,
endpoint: EndpointId,
role_name: RoleName,
auth_rules: &F,
) -> anyhow::Result<Arc<JwkCacheEntry>> {
@@ -119,9 +115,7 @@ impl JwkCacheEntryLock {
}
}
let rules = auth_rules
.fetch_auth_rules(ctx, endpoint, role_name)
.await?;
let rules = auth_rules.fetch_auth_rules(role_name).await?;
let mut key_sets =
ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
// TODO(conrad): run concurrently
@@ -172,7 +166,6 @@ impl JwkCacheEntryLock {
self: &Arc<Self>,
ctx: &RequestMonitoring,
client: &reqwest::Client,
endpoint: EndpointId,
role_name: RoleName,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
@@ -183,9 +176,7 @@ impl JwkCacheEntryLock {
let Some(cached) = guard else {
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let permit = self.acquire_permit().await;
return self
.renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
.await;
return self.renew_jwks(permit, client, role_name, fetch).await;
};
let last_update = now.duration_since(cached.last_retrieved);
@@ -196,9 +187,7 @@ impl JwkCacheEntryLock {
let permit = self.acquire_permit().await;
// it's been too long since we checked the keys. wait for them to update.
return self
.renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
.await;
return self.renew_jwks(permit, client, role_name, fetch).await;
}
// every 5 minutes we should spawn a job to eagerly update the token.
@@ -209,12 +198,8 @@ impl JwkCacheEntryLock {
let entry = self.clone();
let client = client.clone();
let fetch = fetch.clone();
let ctx = ctx.clone();
tokio::spawn(async move {
if let Err(e) = entry
.renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
.await
{
if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
tracing::warn!(error=?e, "could not fetch JWKs in background job");
}
});
@@ -231,7 +216,6 @@ impl JwkCacheEntryLock {
ctx: &RequestMonitoring,
jwt: &str,
client: &reqwest::Client,
endpoint: EndpointId,
role_name: RoleName,
fetch: &F,
) -> Result<(), anyhow::Error> {
@@ -258,7 +242,7 @@ impl JwkCacheEntryLock {
let kid = header.key_id.context("missing key id")?;
let mut guard = self
.get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
.get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
.await?;
// get the key from the JWKs if possible. If not, wait for the keys to update.
@@ -270,14 +254,7 @@ impl JwkCacheEntryLock {
let permit = self.acquire_permit().await;
guard = self
.renew_jwks(
permit,
ctx,
client,
endpoint.clone(),
role_name.clone(),
fetch,
)
.renew_jwks(permit, client, role_name.clone(), fetch)
.await?;
}
_ => {
@@ -341,7 +318,7 @@ impl JwkCache {
jwt: &str,
) -> Result<(), anyhow::Error> {
// try with just a read lock first
let key = (endpoint.clone(), role_name.clone());
let key = (endpoint, role_name.clone());
let entry = self.map.get(&key).as_deref().map(Arc::clone);
let entry = entry.unwrap_or_else(|| {
// acquire a write lock after to insert.
@@ -350,7 +327,7 @@ impl JwkCache {
});
entry
.check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch)
.check_jwt(ctx, jwt, &self.client, role_name, fetch)
.await
}
}
@@ -711,8 +688,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
impl FetchAuthRules for Fetch {
async fn fetch_auth_rules(
&self,
_ctx: &RequestMonitoring,
_endpoint: EndpointId,
_role_name: RoleName,
) -> anyhow::Result<Vec<AuthRule>> {
Ok(vec![
@@ -731,7 +706,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
}
let role_name = RoleName::from("user");
let endpoint = EndpointId::from("ep");
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
@@ -741,7 +715,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
&RequestMonitoring::test(),
&token,
&client,
endpoint.clone(),
role_name.clone(),
&Fetch(addr),
)

View File

@@ -9,9 +9,8 @@ use crate::{
messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
NodeInfo,
},
context::RequestMonitoring,
intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
EndpointId, RoleName,
RoleName,
};
use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -58,12 +57,7 @@ pub struct JwksRoleSettings {
}
impl FetchAuthRules for StaticAuthRules {
async fn fetch_auth_rules(
&self,
_ctx: &RequestMonitoring,
_endpoint: EndpointId,
role_name: RoleName,
) -> anyhow::Result<Vec<AuthRule>> {
async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
let mappings = JWKS_ROLE_MAP.load();
let role_mappings = mappings
.as_deref()

View File

@@ -92,12 +92,6 @@ struct SqlOverHttpArgs {
#[clap(long, default_value_t = 16)]
sql_over_http_cancel_set_shards: usize,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_request_size_bytes: u64,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_response_size_bytes: usize,
}
#[tokio::main]
@@ -214,8 +208,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
},
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
};
Ok(Box::leak(Box::new(ProxyConfig {

View File

@@ -62,13 +62,12 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
Console,
#[cfg(feature = "testing")]
Postgres,
// clap only shows the name, not the alias, in usage text.
// TODO: swap name/alias and deprecate "link"
#[value(name("link"), alias("web"))]
Web,
#[cfg(feature = "testing")]
Postgres,
}
/// Neon proxy/router
@@ -269,12 +268,6 @@ struct SqlOverHttpArgs {
#[clap(long, default_value_t = 64)]
sql_over_http_cancel_set_shards: usize,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_request_size_bytes: u64,
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
sql_over_http_max_response_size_bytes: usize,
}
#[tokio::main]
@@ -640,19 +633,17 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let api = console::provider::ConsoleBackend::Console(api);
auth::Backend::Console(MaybeOwned::Owned(api), ())
}
AuthBackendType::Web => {
let url = args.uri.parse()?;
auth::Backend::Web(MaybeOwned::Owned(url), ())
}
#[cfg(feature = "testing")]
AuthBackendType::Postgres => {
let url = args.auth_endpoint.parse()?;
let api = console::provider::mock::Api::new(url, !args.is_private_access_proxy);
let api = console::provider::mock::Api::new(url);
let api = console::provider::ConsoleBackend::Postgres(api);
auth::Backend::Console(MaybeOwned::Owned(api), ())
}
AuthBackendType::Web => {
let url = args.uri.parse()?;
auth::Backend::Web(MaybeOwned::Owned(url), ())
}
};
let config::ConcurrencyLockOptions {
@@ -688,8 +679,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
},
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
};
let authentication_config = AuthenticationConfig {
thread_pool,

View File

@@ -56,8 +56,6 @@ pub struct HttpConfig {
pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet,
pub client_conn_threshold: u64,
pub max_request_size_bytes: u64,
pub max_response_size_bytes: usize,
}
pub struct AuthenticationConfig {

View File

@@ -303,7 +303,6 @@ impl NodeInfo {
pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
match keys {
#[cfg(any(test, feature = "testing"))]
ComputeCredentialKeys::Password(password) => self.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
ComputeCredentialKeys::None => &mut self.config,

View File

@@ -41,15 +41,11 @@ impl From<tokio_postgres::Error> for ApiError {
#[derive(Clone)]
pub struct Api {
endpoint: ApiUrl,
ip_allowlist_check_enabled: bool,
}
impl Api {
pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
Self {
endpoint,
ip_allowlist_check_enabled,
}
pub fn new(endpoint: ApiUrl) -> Self {
Self { endpoint }
}
pub(crate) fn url(&self) -> &str {
@@ -68,7 +64,6 @@ impl Api {
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
tokio::spawn(connection);
let secret = if let Some(entry) = get_execute_postgres_query(
&client,
"select rolpassword from pg_catalog.pg_authid where rolname = $1",
@@ -84,26 +79,21 @@ impl Api {
warn!("user '{}' does not exist", user_info.user);
None
};
let allowed_ips = if self.ip_allowlist_check_enabled {
match get_execute_postgres_query(
&client,
"select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
&[&user_info.endpoint.as_str()],
"allowed_ips",
)
.await?
{
Some(s) => {
info!("got allowed_ips: {s}");
s.split(',')
.map(|s| IpPattern::from_str(s).unwrap())
.collect()
}
None => vec![],
let allowed_ips = match get_execute_postgres_query(
&client,
"select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
&[&user_info.endpoint.as_str()],
"allowed_ips",
)
.await?
{
Some(s) => {
info!("got allowed_ips: {s}");
s.split(',')
.map(|s| IpPattern::from_str(s).unwrap())
.collect()
}
} else {
vec![]
None => vec![],
};
Ok((secret, allowed_ips))

View File

@@ -79,40 +79,6 @@ pub(crate) enum AuthMethod {
Cleartext,
}
impl Clone for RequestMonitoring {
fn clone(&self) -> Self {
let inner = self.0.try_lock().expect("should not deadlock");
let new = RequestMonitoringInner {
peer_addr: inner.peer_addr,
session_id: inner.session_id,
protocol: inner.protocol,
first_packet: inner.first_packet,
region: inner.region,
span: info_span!("background_task"),
project: inner.project,
branch: inner.branch,
endpoint_id: inner.endpoint_id.clone(),
dbname: inner.dbname.clone(),
user: inner.user.clone(),
application: inner.application.clone(),
error_kind: inner.error_kind,
auth_method: inner.auth_method.clone(),
success: inner.success,
rejected: inner.rejected,
cold_start_info: inner.cold_start_info,
pg_options: inner.pg_options.clone(),
sender: None,
disconnect_sender: None,
latency_timer: LatencyTimer::noop(inner.protocol),
disconnect_timestamp: inner.disconnect_timestamp,
};
Self(TryLock::new(new))
}
}
impl RequestMonitoring {
pub fn new(
session_id: Uuid,

View File

@@ -397,8 +397,6 @@ pub struct LatencyTimer {
protocol: Protocol,
cold_start_info: ColdStartInfo,
outcome: ConnectOutcome,
skip_reporting: bool,
}
impl LatencyTimer {
@@ -411,20 +409,6 @@ impl LatencyTimer {
cold_start_info: ColdStartInfo::Unknown,
// assume failed unless otherwise specified
outcome: ConnectOutcome::Failed,
skip_reporting: false,
}
}
pub(crate) fn noop(protocol: Protocol) -> Self {
Self {
start: time::Instant::now(),
stop: None,
accumulated: Accumulated::default(),
protocol,
cold_start_info: ColdStartInfo::Unknown,
// assume failed unless otherwise specified
outcome: ConnectOutcome::Failed,
skip_reporting: true,
}
}
@@ -459,10 +443,6 @@ pub enum ConnectOutcome {
impl Drop for LatencyTimer {
fn drop(&mut self) {
if self.skip_reporting {
return;
}
let duration = self
.stop
.unwrap_or_else(time::Instant::now)

View File

@@ -27,7 +27,7 @@ use crate::{
Host,
};
use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};
pub(crate) struct PoolingBackend {
pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -274,6 +274,13 @@ impl ConnectMechanism for TokioMechanism {
.dbname(&self.conn_info.dbname)
.connect_timeout(timeout);
match &self.conn_info.auth {
AuthData::Jwt(_) => {}
AuthData::Password(pw) => {
config.password(pw);
}
}
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let res = config.connect(tokio_postgres::NoTls).await;
drop(pause);

View File

@@ -29,16 +29,11 @@ use tracing::{info, info_span, Instrument};
use super::backend::HttpConnError;
#[derive(Debug, Clone)]
pub(crate) struct ConnInfoWithAuth {
pub(crate) conn_info: ConnInfo,
pub(crate) auth: AuthData,
}
#[derive(Debug, Clone)]
pub(crate) struct ConnInfo {
pub(crate) user_info: ComputeUserInfo,
pub(crate) dbname: DbName,
pub(crate) auth: AuthData,
}
#[derive(Debug, Clone)]
@@ -781,8 +776,6 @@ mod tests {
},
cancel_set: CancelSet::new(0),
client_conn_threshold: u64::MAX,
max_request_size_bytes: u64::MAX,
max_response_size_bytes: usize::MAX,
}));
let pool = GlobalConnPool::new(config);
let conn_info = ConnInfo {
@@ -792,6 +785,7 @@ mod tests {
options: NeonOptions::default(),
},
dbname: "dbname".into(),
auth: AuthData::Password("password".as_bytes().into()),
};
let ep_pool = Arc::downgrade(
&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
@@ -849,6 +843,7 @@ mod tests {
options: NeonOptions::default(),
},
dbname: "dbname".into(),
auth: AuthData::Password("password".as_bytes().into()),
};
let ep_pool = Arc::downgrade(
&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),

View File

@@ -60,7 +60,6 @@ use super::backend::PoolingBackend;
use super::conn_pool::AuthData;
use super::conn_pool::Client;
use super::conn_pool::ConnInfo;
use super::conn_pool::ConnInfoWithAuth;
use super::http_util::json_response;
use super::json::json_to_pg_text;
use super::json::pg_text_row_to_json;
@@ -88,6 +87,9 @@ enum Payload {
Batch(BatchQueryData),
}
const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -149,7 +151,7 @@ fn get_conn_info(
ctx: &RequestMonitoring,
headers: &HeaderMap,
tls: Option<&TlsConfig>,
) -> Result<ConnInfoWithAuth, ConnInfoError> {
) -> Result<ConnInfo, ConnInfoError> {
// HTTP only uses cleartext (for now and likely always)
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -236,8 +238,11 @@ fn get_conn_info(
options: options.unwrap_or_default(),
};
let conn_info = ConnInfo { user_info, dbname };
Ok(ConnInfoWithAuth { conn_info, auth })
Ok(ConnInfo {
user_info,
dbname,
auth,
})
}
// TODO: return different http error codes
@@ -361,10 +366,10 @@ pub(crate) enum SqlOverHttpError {
ConnectCompute(#[from] HttpConnError),
#[error("{0}")]
ConnInfo(#[from] ConnInfoError),
#[error("request is too large (max is {0} bytes)")]
RequestTooLarge(u64),
#[error("response is too large (max is {0} bytes)")]
ResponseTooLarge(usize),
#[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
RequestTooLarge,
#[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
ResponseTooLarge,
#[error("invalid isolation level")]
InvalidIsolationLevel,
#[error("{0}")]
@@ -381,8 +386,8 @@ impl ReportableError for SqlOverHttpError {
SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
SqlOverHttpError::RequestTooLarge => ErrorKind::User,
SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
SqlOverHttpError::Postgres(p) => p.get_error_kind(),
SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
@@ -397,8 +402,8 @@ impl UserFacingError for SqlOverHttpError {
SqlOverHttpError::ReadPayload(p) => p.to_string(),
SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
SqlOverHttpError::RequestTooLarge => self.to_string(),
SqlOverHttpError::ResponseTooLarge => self.to_string(),
SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
SqlOverHttpError::Postgres(p) => p.to_string(),
SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
@@ -521,10 +526,7 @@ async fn handle_inner(
// TLS config should be there.
let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
info!(
user = conn_info.conn_info.user_info.user.as_str(),
"credentials"
);
info!(user = conn_info.user_info.user.as_str(), "credentials");
// Allow connection pooling only if explicitly requested
// or if we have decided that http pool is no longer opt-in
@@ -535,7 +537,7 @@ async fn handle_inner(
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
None => config.http_config.max_request_size_bytes + 1,
None => MAX_REQUEST_SIZE + 1,
};
info!(request_content_length, "request size in bytes");
Metrics::get()
@@ -545,10 +547,8 @@ async fn handle_inner(
// we don't have a streaming request support yet so this is to prevent OOM
// from a malicious user sending an extremely large request body
if request_content_length > config.http_config.max_request_size_bytes {
return Err(SqlOverHttpError::RequestTooLarge(
config.http_config.max_request_size_bytes,
));
if request_content_length > MAX_REQUEST_SIZE {
return Err(SqlOverHttpError::RequestTooLarge);
}
let fetch_and_process_request = Box::pin(
@@ -569,20 +569,20 @@ async fn handle_inner(
.authenticate_with_password(
ctx,
&config.authentication_config,
&conn_info.conn_info.user_info,
&conn_info.user_info,
pw,
)
.await?
}
AuthData::Jwt(jwt) => {
backend
.authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt)
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
.await?
}
};
let client = backend
.connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool)
.connect_to_compute(ctx, conn_info, keys, !allow_pool)
.await?;
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
@@ -612,10 +612,7 @@ async fn handle_inner(
// Now execute the query and return the result.
let json_output = match payload {
Payload::Single(stmt) => {
stmt.process(config, cancel, &mut client, parsed_headers)
.await?
}
Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
Payload::Batch(statements) => {
if parsed_headers.txn_read_only {
response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
@@ -631,7 +628,7 @@ async fn handle_inner(
}
statements
.process(config, cancel, &mut client, parsed_headers)
.process(cancel, &mut client, parsed_headers)
.await?
}
};
@@ -659,7 +656,6 @@ async fn handle_inner(
impl QueryData {
async fn process(
self,
config: &'static ProxyConfig,
cancel: CancellationToken,
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
@@ -668,7 +664,7 @@ impl QueryData {
let cancel_token = inner.cancel_token();
let res = match select(
pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
pin!(cancel.cancelled()),
)
.await
@@ -731,7 +727,6 @@ impl QueryData {
impl BatchQueryData {
async fn process(
self,
config: &'static ProxyConfig,
cancel: CancellationToken,
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
@@ -756,52 +751,44 @@ impl BatchQueryData {
discard.discard();
})?;
let json_output = match query_batch(
config,
cancel.child_token(),
&transaction,
self,
parsed_headers,
)
.await
{
Ok(json_output) => {
info!("commit");
let status = transaction.commit().await.inspect_err(|_| {
// if we cannot commit - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})?;
discard.check_idle(status);
json_output
}
Err(SqlOverHttpError::Cancelled(_)) => {
if let Err(err) = cancel_token.cancel_query(NoTls).await {
tracing::error!(?err, "could not cancel query");
let json_output =
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
Ok(json_output) => {
info!("commit");
let status = transaction.commit().await.inspect_err(|_| {
// if we cannot commit - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})?;
discard.check_idle(status);
json_output
}
// TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
discard.discard();
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
}
Err(err) => {
info!("rollback");
let status = transaction.rollback().await.inspect_err(|_| {
// if we cannot rollback - for now don't return connection to pool
// TODO: get a query status from the error
Err(SqlOverHttpError::Cancelled(_)) => {
if let Err(err) = cancel_token.cancel_query(NoTls).await {
tracing::error!(?err, "could not cancel query");
}
// TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
discard.discard();
})?;
discard.check_idle(status);
return Err(err);
}
};
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
}
Err(err) => {
info!("rollback");
let status = transaction.rollback().await.inspect_err(|_| {
// if we cannot rollback - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
})?;
discard.check_idle(status);
return Err(err);
}
};
Ok(json_output)
}
}
async fn query_batch(
config: &'static ProxyConfig,
cancel: CancellationToken,
transaction: &Transaction<'_>,
queries: BatchQueryData,
@@ -811,7 +798,6 @@ async fn query_batch(
let mut current_size = 0;
for stmt in queries.queries {
let query = pin!(query_to_json(
config,
transaction,
stmt,
&mut current_size,
@@ -840,7 +826,6 @@ async fn query_batch(
}
async fn query_to_json<T: GenericClient>(
config: &'static ProxyConfig,
client: &T,
data: QueryData,
current_size: &mut usize,
@@ -861,10 +846,8 @@ async fn query_to_json<T: GenericClient>(
rows.push(row);
// we don't have a streaming response support yet so this is to prevent OOM
// from a malicious query (eg a cross join)
if *current_size > config.http_config.max_response_size_bytes {
return Err(SqlOverHttpError::ResponseTooLarge(
config.http_config.max_response_size_bytes,
));
if *current_size > MAX_RESPONSE_SIZE {
return Err(SqlOverHttpError::ResponseTooLarge);
}
}

View File

@@ -13,12 +13,14 @@ testing = ["fail/failpoints"]
[dependencies]
async-stream.workspace = true
anyhow.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
camino-tempfile.workspace = true
chrono.workspace = true
clap = { workspace = true, features = ["derive"] }
const_format.workspace = true
crc32c.workspace = true
fail.workspace = true
git-version.workspace = true
@@ -36,6 +38,8 @@ scopeguard.workspace = true
reqwest = { workspace = true, features = ["json"] }
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
signal-hook.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
@@ -44,6 +48,7 @@ tokio-util = { workspace = true }
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-tar.workspace = true
toml_edit.workspace = true
tracing.workspace = true
url.workspace = true
metrics.workspace = true

View File

@@ -17,7 +17,6 @@ use postgres_ffi::MAX_SEND_SIZE;
use serde::Deserialize;
use serde::Serialize;
use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName};
use sha2::{Digest, Sha256};
use utils::id::NodeId;
use utils::id::TenantTimelineId;
@@ -52,9 +51,6 @@ pub struct Args {
/// Dump full term history. True by default.
pub dump_term_history: bool,
/// Dump last modified time of WAL segments. Uses value of `dump_all` by default.
pub dump_wal_last_modified: bool,
/// Filter timelines by tenant_id.
pub tenant_id: Option<TenantId>,
@@ -132,19 +128,12 @@ async fn build_from_tli_dump(
None
};
let wal_last_modified = if args.dump_wal_last_modified {
get_wal_last_modified(timeline_dir).ok().flatten()
} else {
None
};
Timeline {
tenant_id: timeline.ttid.tenant_id,
timeline_id: timeline.ttid.timeline_id,
control_file,
memory,
disk_content,
wal_last_modified,
}
}
@@ -167,7 +156,6 @@ pub struct Timeline {
pub control_file: Option<TimelinePersistentState>,
pub memory: Option<Memory>,
pub disk_content: Option<DiskContent>,
pub wal_last_modified: Option<DateTime<Utc>>,
}
#[derive(Debug, Serialize, Deserialize)]
@@ -314,27 +302,6 @@ fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
})
}
/// Get highest modified time of WAL segments in the directory.
fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
let mut res = None;
for entry in fs::read_dir(path)? {
if entry.is_err() {
continue;
}
let entry = entry?;
/* Ignore files that are not XLOG segments */
let fname = entry.file_name();
if !IsXLogFileName(&fname) && !IsPartialXLogFileName(&fname) {
continue;
}
let metadata = entry.metadata()?;
let modified: DateTime<Utc> = DateTime::from(metadata.modified()?);
res = std::cmp::max(res, Some(modified));
}
Ok(res)
}
/// Converts SafeKeeperConf to Config, filtering out the fields that are not
/// supposed to be exposed.
fn build_config(config: SafeKeeperConf) -> Config {

View File

@@ -1,11 +1,7 @@
openapi: "3.0.2"
info:
title: Safekeeper control API
description: Neon Safekeeper API
version: "1.0"
license:
name: "Apache"
url: https://github.com/neondatabase/neon/blob/main/LICENSE
servers:
@@ -390,12 +386,6 @@ components:
msg:
type: string
NotFoundError:
type: object
properties:
msg:
type: string
responses:
#

View File

@@ -481,7 +481,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
let mut dump_memory: Option<bool> = None;
let mut dump_disk_content: Option<bool> = None;
let mut dump_term_history: Option<bool> = None;
let mut dump_wal_last_modified: Option<bool> = None;
let mut tenant_id: Option<TenantId> = None;
let mut timeline_id: Option<TimelineId> = None;
@@ -495,7 +494,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
"dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
"dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
"dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
"dump_wal_last_modified" => dump_wal_last_modified = Some(parse_kv_str(&k, &v)?),
"tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
"timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
_ => Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -510,7 +508,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
let dump_memory = dump_memory.unwrap_or(dump_all);
let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
let dump_term_history = dump_term_history.unwrap_or(true);
let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
let args = debug_dump::Args {
dump_all,
@@ -518,7 +515,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
dump_memory,
dump_disk_content,
dump_term_history,
dump_wal_last_modified,
tenant_id,
timeline_id,
};

View File

@@ -278,7 +278,7 @@ impl WalResidentTimeline {
}
/// pull_timeline request body.
#[derive(Debug, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Request {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -293,7 +293,7 @@ pub struct Response {
}
/// Response for debug dump request.
#[derive(Debug, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct DebugDumpResponse {
pub start_time: DateTime<Utc>,
pub finish_time: DateTime<Utc>,

View File

@@ -539,17 +539,20 @@ async fn remove_segments_from_disk(
while let Some(entry) = entries.next_entry().await? {
let entry_path = entry.path();
let fname = entry_path.file_name().unwrap();
/* Ignore files that are not XLOG segments */
if !IsXLogFileName(fname) && !IsPartialXLogFileName(fname) {
continue;
}
let (segno, _) = XLogFromFileName(fname, wal_seg_size)?;
if remove_predicate(segno) {
remove_file(entry_path).await?;
n_removed += 1;
min_removed = min(min_removed, segno);
max_removed = max(max_removed, segno);
REMOVED_WAL_SEGMENTS.inc();
if let Some(fname_str) = fname.to_str() {
/* Ignore files that are not XLOG segments */
if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) {
continue;
}
let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
if remove_predicate(segno) {
remove_file(entry_path).await?;
n_removed += 1;
min_removed = min(min_removed, segno);
max_removed = max(max_removed, segno);
REMOVED_WAL_SEGMENTS.inc();
}
}
}

View File

@@ -10,6 +10,7 @@ bench = []
[dependencies]
anyhow.workspace = true
async-stream.workspace = true
bytes.workspace = true
clap = { workspace = true, features = ["derive"] }
const_format.workspace = true
futures.workspace = true
@@ -23,6 +24,7 @@ parking_lot.workspace = true
prost.workspace = true
tonic.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] }
tokio-stream.workspace = true
tracing.workspace = true
metrics.workspace = true
utils.workspace = true

View File

@@ -15,7 +15,9 @@ testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
clap.workspace = true
fail.workspace = true

View File

@@ -5,7 +5,18 @@ edition.workspace = true
license.workspace = true
[dependencies]
pageserver_api.workspace = true
pageserver_client.workspace = true
thiserror.workspace = true
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio.workspace = true
futures.workspace = true
tokio-util.workspace = true
anyhow.workspace = true
postgres.workspace = true
bytes.workspace = true

View File

@@ -71,37 +71,6 @@ impl ComputeHookTenant {
}
}
fn is_sharded(&self) -> bool {
matches!(self, ComputeHookTenant::Sharded(_))
}
/// Clear compute hook state for the specified shard.
/// Only valid for [`ComputeHookTenant::Sharded`] instances.
fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) {
match self {
ComputeHookTenant::Sharded(sharded) => {
if sharded.stripe_size != stripe_size
|| sharded.shard_count != tenant_shard_id.shard_count
{
tracing::warn!("Shard split detected while handling detach")
}
let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| {
*shard_number == tenant_shard_id.shard_number
});
if let Some(shard_idx) = shard_idx {
sharded.shards.remove(shard_idx);
} else {
tracing::warn!("Shard not found while handling detach")
}
}
ComputeHookTenant::Unsharded(_) => {
unreachable!("Detach of unsharded tenants is handled externally");
}
}
}
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
/// and drops existing content.
fn update(
@@ -645,36 +614,6 @@ impl ComputeHook {
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
.await
}
/// Reflect a detach for a particular shard in the compute hook state.
///
/// The goal is to avoid sending compute notifications with stale information (i.e.
/// including detach pageservers).
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) fn handle_detach(
&self,
tenant_shard_id: TenantShardId,
stripe_size: ShardStripeSize,
) {
use std::collections::hash_map::Entry;
let mut state_locked = self.state.lock().unwrap();
match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(_) => {
tracing::warn!("Compute hook tenant not found for detach");
}
Entry::Occupied(mut e) => {
let sharded = e.get().is_sharded();
if !sharded {
e.remove();
} else {
e.get_mut().remove_shard(tenant_shard_id, stripe_size);
}
tracing::debug!("Compute hook handled shard detach");
}
}
}
}
#[cfg(test)]

View File

@@ -1849,7 +1849,7 @@ pub fn make_router(
RequestName("v1_tenant_timeline"),
)
})
.put(
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config",
|r| {
tenant_service_handler(

View File

@@ -238,7 +238,7 @@ impl PageserverClient {
) -> Result<()> {
measured_request!(
"timeline_archival_config",
crate::metrics::Method::Put,
crate::metrics::Method::Post,
&self.node_id_label,
self.inner
.timeline_archival_config(tenant_shard_id, timeline_id, req)

View File

@@ -820,16 +820,6 @@ impl Reconciler {
self.location_config(&node, conf, None, false).await?;
}
// The condition below identifies a detach. We must have no attached intent and
// must have been attached to something previously. Pass this information to
// the [`ComputeHook`] such that it can update its tenant-wide state.
if self.intent.attached.is_none() && !self.detach.is_empty() {
// TODO: Consider notifying control plane about detaches. This would avoid situations
// where the compute tries to start-up with a stale set of pageservers.
self.compute_hook
.handle_detach(self.tenant_shard_id, self.shard.stripe_size);
}
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
Ok(())

Some files were not shown because too many files have changed in this diff Show More