Compare commits

..

3 Commits

Author SHA1 Message Date
Konstantin Knizhnik
798e316cb5 Bump Postgres version 2025-02-01 08:19:22 +02:00
Konstantin Knizhnik
4bf6d4c378 Bump Postgres version 2025-02-01 08:19:22 +02:00
Konstantin Knizhnik
61c4575b1e Add support of dedicated backends 2025-02-01 08:18:16 +02:00
120 changed files with 2046 additions and 5089 deletions

View File

@@ -24,4 +24,3 @@
!storage_controller/
!vendor/postgres-*/
!workspace_hack/
!build_tools/patches

View File

@@ -41,10 +41,7 @@ inputs:
description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library'
required: false
default: '/tmp/neon/pg_install/v16/lib'
project_settings:
description: 'A JSON object with project settings'
required: false
default: '{}'
outputs:
dsn:
@@ -76,7 +73,7 @@ runs:
\"provisioner\": \"k8s-neonvm\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": ${PROJECT_SETTINGS}
\"settings\": { }
}
}")
@@ -95,12 +92,12 @@ runs:
if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
# determine tenant ID
TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"
echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
# we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
curl -X PUT \
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
@@ -121,4 +118,3 @@ runs:
STRIPE_SIZE: ${{ inputs.stripe_size }}
PSQL: ${{ inputs.psql_path }}
LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
PROJECT_SETTINGS: ${{ inputs.project_settings }}

View File

@@ -121,8 +121,6 @@ runs:
export DEFAULT_PG_VERSION=${PG_VERSION#v}
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1
export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1
if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1

View File

@@ -20,7 +20,7 @@ on:
required: true
type: string
test-cfg:
description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on'
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string
@@ -48,8 +48,6 @@ jobs:
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
env:
BUILD_TYPE: ${{ inputs.build-type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -89,7 +87,6 @@ jobs:
- name: Set env variables
env:
ARCH: ${{ inputs.arch }}
SANITIZERS: ${{ matrix.sanitizers }}
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
@@ -102,14 +99,8 @@ jobs:
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
if [[ $SANITIZERS == 'enabled' ]]; then
make_vars="WITH_SANITIZERS=yes"
else
make_vars=""
fi
{
echo "cov_prefix=${cov_prefix}"
echo "make_vars=${make_vars}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
@@ -145,39 +136,37 @@ jobs:
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make ${make_vars} postgres-v14 -j$(nproc)
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make ${make_vars} postgres-v15 -j$(nproc)
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make ${make_vars} postgres-v16 -j$(nproc)
run: mold -run make postgres-v16 -j$(nproc)
- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: mold -run make ${make_vars} postgres-v17 -j$(nproc)
run: mold -run make postgres-v17 -j$(nproc)
- name: Build neon extensions
run: mold -run make ${make_vars} neon-pg-ext -j$(nproc)
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make ${make_vars} walproposer-lib -j$(nproc)
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
env:
WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }}
run: |
export ASAN_OPTIONS=detect_leaks=0
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
env:
ARCH: ${{ inputs.arch }}
SANITIZERS: ${{ matrix.sanitizers }}
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
@@ -192,7 +181,7 @@ jobs:
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
@@ -225,10 +214,11 @@ jobs:
role-duration-seconds: 18000 # 5 hours
- name: Run rust tests
if: ${{ matrix.sanitizers != 'enabled' }}
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
export LD_LIBRARY_PATH
@@ -287,7 +277,6 @@ jobs:
DATABASE_URL: postgresql://localhost:1235/storage_controller
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
run: |
export ASAN_OPTIONS=detect_leaks=0
/tmp/neon/bin/neon_local init
/tmp/neon/bin/neon_local storage_controller start
@@ -334,7 +323,7 @@ jobs:
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }}
timeout-minutes: 60
with:
build_type: ${{ inputs.build-type }}
test_selection: regress
@@ -352,7 +341,6 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ matrix.sanitizers }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -67,9 +67,9 @@ jobs:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Look for existing PR
id: get-pr
env:
@@ -77,7 +77,7 @@ jobs:
run: |
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
- name: Get changed labels
id: get-labels
if: steps.get-pr.outputs.ALREADY_CREATED != ''
@@ -94,6 +94,8 @@ jobs:
echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}
- run: gh pr checkout "${PR_NUMBER}"
- run: git checkout -b "${BRANCH}"
- run: git push --force origin "${BRANCH}"
@@ -101,7 +103,7 @@ jobs:
- name: Create a Pull Request for CI run (if required)
if: steps.get-pr.outputs.ALREADY_CREATED == ''
env:
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
@@ -138,7 +140,7 @@ jobs:
- run: git push --force origin "${BRANCH}"
if: steps.get-pr.outputs.ALREADY_CREATED != ''
cleanup:
# Close PRs and delete branchs if the original PR is closed.

View File

@@ -11,7 +11,8 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 3 * * *' # run once a day, timezone is utc
# - cron: '0 3 * * *' # run once a day, timezone is utc
- cron: '0 */10 * * *' # Runs every 10 hours at minute 0
workflow_dispatch: # adds ability to run this manually
inputs:
region_id:
@@ -340,7 +341,7 @@ jobs:
],
"pg_version" : [
16,17
]
],
}'
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
@@ -550,6 +551,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
if: false
permissions:
contents: write
statuses: write
@@ -683,7 +685,8 @@ jobs:
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
# if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: false
permissions:
contents: write
statuses: write
@@ -811,6 +814,7 @@ jobs:
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
# if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: false
permissions:
contents: write
statuses: write
@@ -930,6 +934,7 @@ jobs:
user-examples-compare:
# if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: false
permissions:
contents: write
statuses: write

View File

@@ -235,7 +235,7 @@ jobs:
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Run cargo build (only for v17)
run: cargo build --all --release -j$(sysctl -n hw.ncpu)
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
- name: Check that no warnings are produced (only for v17)
run: ./run_clippy.sh

View File

@@ -682,7 +682,7 @@ jobs:
push: true
pull: true
file: compute/compute-node.Dockerfile
target: extension-tests
target: neon-pg-ext-test
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

View File

@@ -1,133 +0,0 @@
name: Build and Test with Sanitizers
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 1 * * *' # run once a day, timezone is utc
workflow_dispatch:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
jobs:
tag:
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get build tag
run: |
echo run:$GITHUB_RUN_ID
echo ref:$GITHUB_REF_NAME
echo rev:$(git rev-list --count HEAD)
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
fi
shell: bash
id: build-tag
build-build-tools-image:
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit
build-and-test-locally:
needs: [ tag, build-build-tools-image ]
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
build-type: [ release ]
uses: ./.github/workflows/_build-and-test-locally.yml
with:
arch: ${{ matrix.arch }}
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]'
secrets: inherit
create-test-report:
needs: [ build-and-test-locally, build-build-tools-image ]
if: ${{ !cancelled() }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
pull-requests: write
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
runs-on: [ self-hosted, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v7
if: ${{ !cancelled() }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const report = {
reportUrl: "${{ steps.create-allure-report.outputs.report-url }}",
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
}
const coverage = {}
const script = require("./scripts/comment-test-report.js")
await script({
github,
context,
fetch,
report,
coverage,
})

View File

@@ -114,7 +114,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: cargo build --all --release --timings -j$(nproc)
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4

View File

@@ -12,8 +12,8 @@ on:
pull_request:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**/*.py'
- 'test_runner/logical_repl/**/*.py'
- 'test_runner/pg_clients/**'
- 'test_runner/logical_repl/**'
- 'poetry.lock'
workflow_dispatch:
@@ -104,8 +104,6 @@ jobs:
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
project_settings: >-
{"enable_logical_replication": true}
- name: Run tests
uses: ./.github/actions/run-python-test-set

View File

@@ -95,8 +95,7 @@ jobs:
# - conclusion
# - neon-cloud-e2e
conclusion:
# Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow
if: always() && github.event_name == 'merge_group'
if: always()
permissions:
statuses: write # for `github.repos.createCommitStatus(...)`
contents: write

202
Cargo.lock generated
View File

@@ -206,16 +206,6 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "assert-json-diff"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "async-channel"
version = "1.9.0"
@@ -300,9 +290,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "aws-config"
version = "1.5.10"
version = "1.5.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924"
checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -311,7 +301,7 @@ dependencies = [
"aws-sdk-sts",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.60.7",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -342,9 +332,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
version = "1.4.4"
version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea"
checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -376,7 +366,7 @@ dependencies = [
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
@@ -399,7 +389,7 @@ dependencies = [
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -424,7 +414,7 @@ dependencies = [
"aws-smithy-checksums",
"aws-smithy-eventstream",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -447,15 +437,15 @@ dependencies = [
[[package]]
name = "aws-sdk-sso"
version = "1.50.0"
version = "1.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab"
checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -469,15 +459,15 @@ dependencies = [
[[package]]
name = "aws-sdk-ssooidc"
version = "1.51.0"
version = "1.58.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0"
checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -491,15 +481,15 @@ dependencies = [
[[package]]
name = "aws-sdk-sts"
version = "1.51.0"
version = "1.58.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf"
checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
@@ -514,9 +504,9 @@ dependencies = [
[[package]]
name = "aws-sigv4"
version = "1.2.6"
version = "1.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2"
checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05"
dependencies = [
"aws-credential-types",
"aws-smithy-eventstream",
@@ -543,9 +533,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
version = "1.2.1"
version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -575,9 +565,9 @@ dependencies = [
[[package]]
name = "aws-smithy-eventstream"
version = "0.60.5"
version = "0.60.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90"
checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a"
dependencies = [
"aws-smithy-types",
"bytes",
@@ -586,9 +576,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
version = "0.60.11"
version = "0.60.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6"
checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc"
dependencies = [
"aws-smithy-eventstream",
"aws-smithy-runtime-api",
@@ -607,18 +597,9 @@ dependencies = [
[[package]]
name = "aws-smithy-json"
version = "0.60.7"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
dependencies = [
"aws-smithy-types",
]
[[package]]
name = "aws-smithy-json"
version = "0.61.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095"
checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422"
dependencies = [
"aws-smithy-types",
]
@@ -635,9 +616,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.7.4"
version = "1.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45"
checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -679,9 +660,9 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
version = "1.2.9"
version = "1.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510"
checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97"
dependencies = [
"base64-simd",
"bytes",
@@ -714,9 +695,9 @@ dependencies = [
[[package]]
name = "aws-types"
version = "1.3.3"
version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef"
checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2"
dependencies = [
"aws-credential-types",
"aws-smithy-async",
@@ -951,18 +932,6 @@ version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "bb8"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8"
dependencies = [
"async-trait",
"futures-util",
"parking_lot 0.12.1",
"tokio",
]
[[package]]
name = "bcder"
version = "0.7.4"
@@ -1029,12 +998,6 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxcar"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
[[package]]
name = "bstr"
version = "1.5.0"
@@ -1827,24 +1790,11 @@ dependencies = [
"chrono",
"diesel_derives",
"itoa",
"pq-sys",
"r2d2",
"serde_json",
]
[[package]]
name = "diesel-async"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb"
dependencies = [
"async-trait",
"bb8",
"diesel",
"futures-util",
"scoped-futures",
"tokio",
"tokio-postgres",
]
[[package]]
name = "diesel_derives"
version = "2.2.1"
@@ -2458,16 +2408,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "gettid"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "gimli"
version = "0.31.1"
@@ -4247,16 +4187,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "papaya"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
dependencies = [
"equivalent",
"seize",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -4715,6 +4645,15 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "pq-sys"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
dependencies = [
"vcpkg",
]
[[package]]
name = "pq_proto"
version = "0.1.0"
@@ -4884,7 +4823,6 @@ dependencies = [
"ahash",
"anyhow",
"arc-swap",
"assert-json-diff",
"async-compression",
"async-trait",
"atomic-take",
@@ -4892,7 +4830,6 @@ dependencies = [
"aws-sdk-iam",
"aws-sigv4",
"base64 0.13.1",
"boxcar",
"bstr",
"bytes",
"camino",
@@ -4909,7 +4846,6 @@ dependencies = [
"flate2",
"framed-websockets",
"futures",
"gettid",
"hashbrown 0.14.5",
"hashlink",
"hex",
@@ -4932,9 +4868,7 @@ dependencies = [
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"papaya",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
@@ -4981,9 +4915,6 @@ dependencies = [
"tokio-tungstenite 0.21.0",
"tokio-util",
"tracing",
"tracing-log",
"tracing-opentelemetry",
"tracing-serde",
"tracing-subscriber",
"tracing-utils",
"try-lock",
@@ -5035,6 +4966,17 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r2d2"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
dependencies = [
"log",
"parking_lot 0.12.1",
"scheduled-thread-pool",
]
[[package]]
name = "rand"
version = "0.7.3"
@@ -5855,12 +5797,12 @@ dependencies = [
]
[[package]]
name = "scoped-futures"
version = "0.1.4"
name = "scheduled-thread-pool"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091"
checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19"
dependencies = [
"pin-project-lite",
"parking_lot 0.12.1",
]
[[package]]
@@ -5937,16 +5879,6 @@ dependencies = [
"libc",
]
[[package]]
name = "seize"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "semver"
version = "1.0.17"
@@ -6405,7 +6337,6 @@ dependencies = [
"clap",
"control_plane",
"diesel",
"diesel-async",
"diesel_migrations",
"fail",
"futures",
@@ -6420,12 +6351,10 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"rand 0.8.5",
"reqwest",
"routerify",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"scoped-futures",
"scopeguard",
"serde",
"serde_json",
@@ -6433,8 +6362,6 @@ dependencies = [
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-util",
"tracing",
"utils",
@@ -6677,7 +6604,7 @@ dependencies = [
"fastrand 2.2.0",
"once_cell",
"rustix",
"windows-sys 0.59.0",
"windows-sys 0.52.0",
]
[[package]]
@@ -7635,6 +7562,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
@@ -8208,7 +8141,6 @@ dependencies = [
"tower 0.4.13",
"tracing",
"tracing-core",
"tracing-log",
"url",
"zerocopy",
"zeroize",

View File

@@ -54,7 +54,6 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
@@ -194,9 +193,7 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"
tracing-log = "0.2"
tracing-opentelemetry = "0.28"
tracing-serde = "0.2.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }

View File

@@ -45,7 +45,7 @@ COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
&& PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \

View File

@@ -10,29 +10,18 @@ ICU_PREFIX_DIR := /usr/local/icu
# environment variable.
#
BUILD_TYPE ?= debug
WITH_SANITIZERS ?= no
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
ifeq ($(WITH_SANITIZERS),yes)
PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover
COPT += -Wno-error # to avoid failing on warnings induced by sanitizers
PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS)
export CC := gcc
export ASAN_OPTIONS := detect_leaks=0
endif
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
# Exclude static build openssl, icu for local build (MacOS, Linux)
# Only keep for build type release and debug
@@ -44,9 +33,7 @@ endif
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
ifneq ($(WITH_SANITIZERS),yes)
PG_CONFIGURE_OPTS += --with-libseccomp
endif
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
PG_CFLAGS += -DUSE_PREFETCH
ifndef DISABLE_HOMEBREW
@@ -77,6 +64,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
@@ -119,7 +108,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)

View File

@@ -3,17 +3,10 @@ ARG DEBIAN_VERSION=bookworm
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
set -e && \
apt update && \
@@ -46,7 +39,6 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
mkdir /tmp/pgcopydb && \
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
cd /tmp/pgcopydb && \
patch -p1 < /pgcopydbv017.patch && \
make -s clean && \
make -s -j12 install && \
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
@@ -63,8 +55,7 @@ ARG DEBIAN_VERSION
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
SHELL ["/bin/bash", "-c"]
RUN mkdir -p /pgcopydb/bin && \
mkdir -p /pgcopydb/lib && \
@@ -75,7 +66,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
# System deps
@@ -199,14 +190,8 @@ RUN set -e \
# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
# And patches from us:
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
RUN set +o pipefail && \
for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
done && \
set -o pipefail
# Split into separate step to debug flaky failures here
RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
&& ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
&& cd lcov \

View File

@@ -1,57 +0,0 @@
diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c
index d730b03..69a9be9 100644
--- a/src/bin/pgcopydb/copydb.c
+++ b/src/bin/pgcopydb/copydb.c
@@ -44,6 +44,7 @@ GUC dstSettings[] = {
{ "synchronous_commit", "'off'" },
{ "statement_timeout", "0" },
{ "lock_timeout", "0" },
+ { "idle_in_transaction_session_timeout", "0" },
{ NULL, NULL },
};
diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c
index 94f2f46..e051ba8 100644
--- a/src/bin/pgcopydb/pgsql.c
+++ b/src/bin/pgcopydb/pgsql.c
@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql,
LinesBuffer lbuf = { 0 };
+ if (message != NULL){
+ // make sure message is writable by splitLines
+ message = strdup(message);
+ }
+
if (!splitLines(&lbuf, message))
{
/* errors have already been logged */
@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql,
PQbackendPID(pgsql->connection),
lbuf.lines[lineNumber]);
}
+ free(message); // free copy of message we created above
if (pgsql->logSQL)
{
@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context)
/* errors have already been logged */
return;
}
-
if (res != NULL)
{
char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+ if (sqlstate == NULL)
+ {
+ // PQresultErrorField returned NULL!
+ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault
+ }
+ else
+ {
+ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+ }
}
char *endpoint =

File diff suppressed because it is too large Load Diff

View File

@@ -47,9 +47,7 @@ files:
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes

View File

@@ -231,14 +231,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
])
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()

View File

@@ -261,13 +261,7 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);
// Pass through these environment variables to the command
for var in [
"LLVM_PROFILE_FILE",
"FAILPOINTS",
"RUST_LOG",
"ASAN_OPTIONS",
"UBSAN_OPTIONS",
] {
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}

View File

@@ -388,11 +388,6 @@ impl PageServerNode {
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
image_creation_preempt_threshold: settings
.remove("image_creation_preempt_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_preempt_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")

View File

@@ -221,17 +221,7 @@ impl StorageController {
"-p",
&format!("{}", postgres_port),
];
let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
let envs = [
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
];
let exitcode = Command::new(bin_path)
.args(args)
.envs(envs)
.spawn()?
.wait()
.await?;
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
}
@@ -252,11 +242,6 @@ impl StorageController {
let pg_bin_dir = self.get_pg_bin_dir().await?;
let createdb_path = pg_bin_dir.join("createdb");
let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
let envs = [
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
];
let output = Command::new(&createdb_path)
.args([
"-h",
@@ -269,7 +254,6 @@ impl StorageController {
&username(),
DB_NAME,
])
.envs(envs)
.output()
.await
.expect("Failed to spawn createdb");

View File

@@ -32,7 +32,6 @@ reason = "the marvin attack only affects private key decryption, not public key
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
allow = [
"0BSD",
"Apache-2.0",
"BSD-2-Clause",
"BSD-3-Clause",

View File

@@ -52,7 +52,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
if [ $pg_version -ge 16 ]; then
docker cp ext-src $TEST_CONTAINER_NAME:/
docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config

View File

@@ -1,4 +0,0 @@
#!/bin/bash
set -ex
cd "$(dirname "${0}")"
pg_prove test.sql

View File

@@ -1,15 +0,0 @@
diff --git a/test.sql b/test.sql
index d7a0ca8..f15bc76 100644
--- a/test.sql
+++ b/test.sql
@@ -9,9 +9,7 @@
\set ON_ERROR_STOP true
\set QUIET 1
-CREATE EXTENSION pgcrypto;
-CREATE EXTENSION pgtap;
-CREATE EXTENSION pgjwt;
+CREATE EXTENSION IF NOT EXISTS pgtap;
BEGIN;
SELECT plan(23);

View File

@@ -1,5 +0,0 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
pg_prove -d contrib_regression test.sql

View File

@@ -24,7 +24,7 @@ function wait_for_ready {
}
function create_extensions() {
for ext in ${1}; do
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
done
}
EXTENSIONS='[
@@ -40,8 +40,7 @@ EXTENSIONS='[
{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"}
{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d

View File

@@ -285,10 +285,10 @@ To summarize, list of cplane changes:
### storage_controller implementation
If desired, we may continue using current 'load everything on startup and keep
in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16
byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids
plus some flags), so 10^6 of timelines shouldn't take more than 100MB.
Current 'load everything on startup and keep in memory' easy design is fine.
Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
10^6 of timelines shouldn't take more than 100MB.
Similar to pageserver attachment Intents storage_controller would have in-memory
`MigrationRequest` (or its absense) for each timeline and pool of tasks trying
@@ -296,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller
won't do several migrations on the same timeline concurrently. In the first
version it is simpler to have more manual control and no retries, i.e. migration
failure removes the request. Later we can build retries and automatic
scheduling/migration around. `MigrationRequest` is
scheduling/migration. `MigrationRequest` is
```
enum MigrationRequest {
To(Vec<NodeId>),
@@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).
#### Schema
`safekeepers` table mirroring current `nodes` should be added, except that for
`scheduling_policy`: it is enough to have at least in the beginning only 3
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
3) `decomissioned` (node is removed).
`scheduling_policy` field (seems like `status` is a better name for it): it is enough
to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
`decomissioned`.
`timelines` table:
```
@@ -324,24 +324,18 @@ table! {
timelines (tenant_id, timeline_id) {
timeline_id -> Varchar,
tenant_id -> Varchar,
start_lsn -> pg_lsn,
generation -> Int4,
sk_set -> Array<Int4>, // list of safekeeper ids
new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
cplane_notified_generation -> Int4,
deleted_at -> Nullable<Timestamptz>,
}
}
```
`start_lsn` is needed to create timeline on safekeepers properly, see below. We
might also want to add ancestor_timeline_id to preserve the hierarchy, but for
this RFC it is not needed.
#### API
Node management is similar to pageserver:
1) POST `/control/v1/safekeepers` inserts safekeeper.
1) POST `/control/v1/safekeepers` upserts safekeeper.
2) GET `/control/v1/safekeepers` lists safekeepers.
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
@@ -351,15 +345,25 @@ Node management is similar to pageserver:
Safekeeper deploy scripts should register safekeeper at storage_contorller as
they currently do with cplane, under the same id.
Timeline creation/deletion will work through already existing POST and DELETE
`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
succeed. See next section on the implementation details.
Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
would 1) choose initial set of safekeepers; 2) write to the db initial
`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
case of conflict; 3) create timeline on the majority of safekeepers (already
created is ok).
We don't want to block timeline creation/deletion when one safekeeper is down.
Currently this is crutched by compute implicitly creating timeline on any
safekeeper it is connected to. This creates ugly timeline state on safekeeper
when timeline is created, but start LSN is not defined yet. Next section
describes dealing with this.
We don't want to block timeline creation when one safekeeper is down. Currently
this is solved by compute implicitly creating timeline on any safekeeper it is
connected to. This creates ugly timeline state on safekeeper when timeline is
created, but start LSN is not defined yet. It would be nice to remove this; to
do that, controller can in the background retry to create timeline on
safekeeper(s) which missed that during initial creation call. It can do that
through `pull_timeline` from majority so it doesn't need to remember
`parent_lsn` in its db.
Timeline deletion removes the row from the db and forwards deletion to the
current configuration members. Without additional actions deletions might leak,
see below on this; initially let's ignore these, reporting to cplane success if
at least one safekeeper deleted the timeline (this will remove s3 data).
Tenant deletion repeats timeline deletion for all timelines.
@@ -391,6 +395,26 @@ Similar call should be added for the tenant.
It would be great to have some way of subscribing to the results (apart from
looking at logs/metrics).
Migration is executed as described above. One subtlety is that (local) deletion on
source safekeeper might fail, which is not a problem if we are going to
decomission the node but leaves garbage otherwise. I'd propose in the first version
1) Don't attempt deletion at all if node status is `offline`.
2) If it failed, just issue warning.
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
remove garbage timelines for manual use. It will 1) list all timelines on the
safekeeper 2) compare each one against configuration storage: if timeline
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
be deleted under generation number if node is not member of current generation.
Automating this is untrivial; we'd need to register all potential missing
deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
which switches configurations. Similarly when timeline is fully deleted to
prevent cplane operation from blocking when some safekeeper is not available
deletion should be also registered.
One more task pool should infinitely retry notifying control plane about changed
safekeeper sets.
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
current in memory state of the timeline and pending `MigrationRequest`,
if any.
@@ -399,153 +423,12 @@ looking at logs/metrics).
migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
(incrementing generation as always).
#### API implementation and reconciliation
For timeline creation/deletion we want to preserve the basic assumption that
unreachable minority (1 sk of 3) doesn't block their completion, but eventually
we want to finish creation/deletion on nodes which missed it (unless they are
removed). Similarly for migration; it may and should finish even though excluded
members missed their exclusion. And of course e.g. such pending exclusion on
node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As
another example, if some node missed timeline creation it clearly must not block
migration from it. Hence it is natural to have per safekeeper background
reconciler which retries these ops until they succeed. There are 3 possible
operation types, and the type is defined by timeline state (membership
configuration and whether it is deleted) and safekeeper id: we may need to
create timeline on sk (node added), locally delete it (node excluded, somewhat
similar to detach) or globally delete it (timeline is deleted).
Next, on storage controller restart in principle these pending operations can be
figured out by comparing safekeepers state against storcon state. But it seems
better to me to materialize them in the database; it is not expensive, avoids
these startup scans which themselves can fail etc and makes it very easy to see
outstanding work directly at the source of truth -- the db. So we can add table
`safekeeper_timeline_pending_ops`
```
table! {
// timeline_id, sk_id is primary key
safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) {
sk_id -> int8,
tenant_id -> Varchar,
timeline_id -> Varchar,
generation -> Int4,
op_type -> Varchar,
}
}
```
`op_type` can be `include` (seed from peers and ensure generation is up to
date), `exclude` (remove locally) and `delete`. Field is actually not strictly
needed as it can be computed from current configuration, but gives more explicit
observability.
`generation` is necessary there because after op is done reconciler must remove
it and not remove another row with higher gen which in theory might appear.
Any insert of row should overwrite (remove) all rows with the same sk and
timeline id but lower `generation` as next op makes previous obsolete. Insertion
of `op_type` `delete` overwrites all rows.
About `exclude`: rather than adding explicit safekeeper http endpoint, it is
reasonable to reuse membership switch endpoint: if safekeeper is not member
of the configuration it locally removes the timeline on the switch. In this case
404 should also be considered an 'ok' answer by the caller.
So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops`
joined with timeline configuration to get current conf (with generation `n`)
for the safekeeper and does the jobs, infinitely retrying failures:
1) If node is member (`include`):
- Check if timeline exists on it, if not, call pull_timeline on it from
other members
- Call switch configuration to the current
2) If node is not member (`exclude`):
- Call switch configuration to the current, 404 is ok.
3) If timeline is deleted (`delete`), call delete.
In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
timeline with generation <= `n` if `op_type` is not `delete`.
In case 3 also remove `safekeeper_timeline_pending_ops`
entry + remove `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline.
Let's consider in details how APIs can be implemented from this angle.
Timeline creation. It is assumed that cplane retries it until success, so all
actions must be idempotent. Now, a tricky point here is timeline start LSN. For
the initial (tenant creation) call cplane doesn't know it. However, setting
start_lsn on safekeepers during creation is a good thing -- it provides a
guarantee that walproposer can always find a common point in WAL histories of
safekeeper and its own, and so absense of it would be a clear sign of
corruption. The following sequence works:
1) Create timeline (or observe that it exists) on pageserver,
figuring out last_record_lsn in response.
2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the
db. Note that last_record_lsn returned on the previous step is movable as it
changes once ingestion starts, insert must not overwrite it (as well as other
fields like membership conf). On the contrary, start_lsn used in the next
step must be set to the value in the db. cplane_notified_generation can be set
to 1 (initial generation) in insert to avoid notifying cplane about initial
conf as cplane will receive it in timeline creation request anyway.
3) Issue timeline creation calls to at least majority of safekeepers. Using
majority here is not necessary but handy because it guarantees that any live
majority will have at least one sk with created timeline and so
reconciliation task can use pull_timeline shared with migration instead of
create timeline special init case. OFC if timeline is already exists call is
ignored.
4) For minority of safekeepers which could have missed creation insert
entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion
because response to cplane is sent only after it has happened, and cplane
retries the call until 200 response.
There is a small question how request handler (timeline creation in this
case) would interact with per sk reconciler. As always I prefer to do the
simplest possible thing and here it seems to be just waking it up so it
re-reads the db for work to do. Passing work in memory is faster, but
that shouldn't matter, and path to scan db for work will exist anyway,
simpler to reuse it.
For pg version / wal segment size: while we may persist them in `timelines`
table, it is not necessary as initial creation at step 3 can take them from
pageserver or cplane creation call and later pull_timeline will carry them
around.
Timeline migration.
1) CAS to the db to create joint conf, and in the same transaction create
`safekeeper_timeline_pending_ops` `include` entries to initialize new members
as well as deliver this conf to current ones; poke per sk reconcilers to work
on it. Also any conf change should also poke cplane notifier task(s).
2) Once it becomes possible per alg description above, get out of joint conf
with another CAS. Task should get wakeups from per sk reconcilers because
conf switch is required for advancement; however retries should be sleep
based as well as LSN advancement might be needed, though in happy path
it isn't. To see whether further transition is possible on wakup migration
executor polls safekeepers per the algorithm. CAS creating new conf with only
new members should again insert entries to `safekeeper_timeline_pending_ops`
to switch them there, as well as `exclude` rows to remove timeline from
old members.
Timeline deletion: just set `deleted_at` on the timeline row and insert
`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
per sk reconcilers.
When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
for it must be cleared in the same transaction.
One more task pool should infinitely retry notifying control plane about changed
safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
#### Dealing with multiple instances of storage_controller
Operations described above executed concurrently might create some errors but do
not prevent progress, so while we normally don't want to run multiple instances
of storage_controller it is fine to have it temporarily, e.g. during redeploy.
To harden against some controller instance creating some work in
`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
the job per sk reconcilers apart from explicit wakups should scan for work
periodically. It is possible to remove that though if all db updates are
protected with leadership token/term -- then such scans are needed only after
leadership is acquired.
Any interactions with db update in-memory controller state, e.g. if migration
request failed because different one is in progress, controller remembers that
and tries to finish it.
@@ -662,7 +545,7 @@ Aurora does this but similarly I don't think this is needed.
We should use Compute <-> safekeeper protocol change to include other (long
yearned) modifications:
- send data in network order without putting whole structs to be arch independent
- send data in network order to make arm work.
- remove term_start_lsn from AppendRequest
- add horizon to TermHistory
- add to ProposerGreeting number of connection from this wp to sk

View File

@@ -204,16 +204,14 @@ impl RemoteExtSpec {
// Check if extension is present in public or custom.
// If not, then it is not allowed to be used by this compute.
if !self
.public_extensions
.as_ref()
.is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
&& !self
.custom_extensions
.as_ref()
.is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
{
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
if let Some(public_extensions) = &self.public_extensions {
if !public_extensions.contains(&real_ext_name.to_string()) {
if let Some(custom_extensions) = &self.custom_extensions {
if !custom_extensions.contains(&real_ext_name.to_string()) {
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
}
}
}
}
match self.extension_data.get(real_ext_name) {
@@ -342,102 +340,6 @@ mod tests {
use super::*;
use std::fs::File;
#[test]
fn allow_installing_remote_extensions() {
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": null,
"custom_extensions": null,
"library_index": {},
"extension_data": {},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect_err("Extension should not be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": [],
"custom_extensions": null,
"library_index": {},
"extension_data": {},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect_err("Extension should not be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": [],
"custom_extensions": [],
"library_index": {
"ext": "ext"
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect_err("Extension should not be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": [],
"custom_extensions": ["ext"],
"library_index": {
"ext": "ext"
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect("Extension should be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": ["ext"],
"custom_extensions": [],
"library_index": {
"extlib": "ext",
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect("Extension should be found");
// test library index for the case when library name
// doesn't match the extension name
rspec
.get_ext("extlib", true, "latest", "v17")
.expect("Library should be found");
}
#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();

View File

@@ -94,7 +94,6 @@ pub struct ConfigToml {
pub ondemand_download_behavior_treat_error_as_warn: bool,
#[serde(with = "humantime_serde")]
pub background_task_maximum_delay: Duration,
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<reqwest::Url>,
pub control_plane_api_token: Option<String>,
pub control_plane_emergency_mode: bool,
@@ -324,10 +323,6 @@ pub struct TenantConfigToml {
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
// How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
// Set to 0 to disable preemption.
pub image_creation_preempt_threshold: usize,
/// The length for an explicit LSN lease request.
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
#[serde(with = "humantime_serde")]
@@ -471,7 +466,6 @@ impl Default for ConfigToml {
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
use_compaction_semaphore: false,
control_plane_api: (None),
control_plane_api_token: (None),
@@ -553,10 +547,6 @@ pub mod tenant_conf_defaults {
// Relevant: https://github.com/neondatabase/neon/issues/3394
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
// If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
// want to enable this feature.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
@@ -615,7 +605,6 @@ impl Default for TenantConfigToml {
lazy_slru_download: false,
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: false,

View File

@@ -498,8 +498,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub image_layer_creation_check_threshold: FieldPatch<u8>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub image_creation_preempt_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lsn_lease_length: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lsn_lease_length_for_ts: FieldPatch<String>,
@@ -546,7 +544,6 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub image_creation_preempt_threshold: Option<usize>,
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
@@ -584,7 +581,6 @@ impl TenantConfig {
mut lazy_slru_download,
mut timeline_get_throttle,
mut image_layer_creation_check_threshold,
mut image_creation_preempt_threshold,
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
@@ -639,9 +635,6 @@ impl TenantConfig {
patch
.image_layer_creation_check_threshold
.apply(&mut image_layer_creation_check_threshold);
patch
.image_creation_preempt_threshold
.apply(&mut image_creation_preempt_threshold);
patch.lsn_lease_length.apply(&mut lsn_lease_length);
patch
.lsn_lease_length_for_ts
@@ -686,7 +679,6 @@ impl TenantConfig {
lazy_slru_download,
timeline_get_throttle,
image_layer_creation_check_threshold,
image_creation_preempt_threshold,
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,

View File

@@ -76,15 +76,7 @@ impl Conf {
let mut cmd = Command::new(path);
cmd.env_clear()
.env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
);
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
Ok(cmd)
}

View File

@@ -64,14 +64,6 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
.env_clear()
.env("LD_LIBRARY_PATH", library_search_path)
.env("DYLD_LIBRARY_PATH", library_search_path)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdin(std::process::Stdio::null())
// stdout invocation produces the same output every time, we don't need it
.stdout(std::process::Stdio::null())

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
use crate::{
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
/// External backup storage configuration, enough for creating a client for that storage.
@@ -45,11 +45,11 @@ impl RemoteStorageKind {
impl RemoteStorageConfig {
/// Helper to fetch the configured concurrency limit.
pub fn concurrency_limit(&self) -> usize {
pub fn concurrency_limit(&self) -> Option<usize> {
match &self.storage {
RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(),
RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(),
RemoteStorageKind::LocalFs { .. } => None,
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
}
}
}

View File

@@ -65,12 +65,6 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// Here, a limit of max 20k concurrent connections was noted.
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
/// Set this limit analogously to the S3 limit.
///
/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds
/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the
/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks.
pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;
/// No limits on the client side, which currenltly means 1000 for AWS S3.
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

View File

@@ -39,7 +39,7 @@ function initdb_with_args {
;;
esac
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}"
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
}
rm -fr "$DATA_DIR"

View File

@@ -5,27 +5,6 @@ use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, VariantNames};
/// Logs a critical error, similarly to `tracing::error!`. This will:
///
/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
/// * Trigger a pageable alert (via the metric below).
/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
/// * In debug builds, panic the process.
///
/// When including errors in the message, please use {err:?} to include the error cause and original
/// backtrace.
#[macro_export]
macro_rules! critical {
($($arg:tt)*) => {{
if cfg!(debug_assertions) {
panic!($($arg)*);
}
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
let backtrace = std::backtrace::Backtrace::capture();
tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
}};
}
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
@@ -46,10 +25,7 @@ impl LogFormat {
}
}
pub struct TracingEventCountMetric {
/// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
/// and also emit it as a regular error. These are thus double-counted, but that seems fine.
critical: IntCounter,
struct TracingEventCountMetric {
error: IntCounter,
warn: IntCounter,
info: IntCounter,
@@ -57,7 +33,7 @@ pub struct TracingEventCountMetric {
trace: IntCounter,
}
pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",
"Number of tracing events, by level",
@@ -70,7 +46,6 @@ pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new
impl TracingEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self {
critical: vec.with_label_values(&["critical"]),
error: vec.with_label_values(&["error"]),
warn: vec.with_label_values(&["warn"]),
info: vec.with_label_values(&["info"]),
@@ -79,11 +54,6 @@ impl TracingEventCountMetric {
}
}
// Allow public access from `critical!` macro.
pub fn inc_critical(&self) {
self.critical.inc();
}
fn inc_for_level(&self, level: tracing::Level) {
let counter = match level {
tracing::Level::ERROR => &self.error,

View File

@@ -177,8 +177,8 @@ impl FileCacheState {
crate::spawn_with_cancel(
token,
|res| {
if let Err(e) = res {
error!(error = format_args!("{e:#}"), "postgres error");
if let Err(error) = res {
error!(%error, "postgres error")
}
},
conn,
@@ -205,7 +205,7 @@ impl FileCacheState {
{
Ok(rows) => Ok(rows),
Err(e) => {
error!(error = format_args!("{e:#}"), "postgres error -> retrying");
error!(error = ?e, "postgres error: {e} -> retrying");
let client = FileCacheState::connect(&self.conn_str, self.token.clone())
.await

View File

@@ -191,12 +191,15 @@ async fn start_monitor(
.await;
let mut monitor = match monitor {
Ok(Ok(monitor)) => monitor,
Ok(Err(e)) => {
error!(error = format_args!("{e:#}"), "failed to create monitor");
Ok(Err(error)) => {
error!(?error, "failed to create monitor");
return;
}
Err(_) => {
error!(?timeout, "creating monitor timed out");
error!(
?timeout,
"creating monitor timed out (probably waiting to receive protocol range)"
);
return;
}
};
@@ -204,9 +207,6 @@ async fn start_monitor(
match monitor.run().await {
Ok(()) => info!("monitor was killed due to new connection"),
Err(e) => error!(
error = format_args!("{e:#}"),
"monitor terminated unexpectedly"
),
Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
}
}

View File

@@ -370,16 +370,12 @@ impl Runner {
}),
InboundMsgKind::InvalidMessage { error } => {
warn!(
error = format_args!("{error:#}"),
id, "received notification of an invalid message we sent"
%error, id, "received notification of an invalid message we sent"
);
Ok(None)
}
InboundMsgKind::InternalError { error } => {
warn!(
error = format_args!("{error:#}"),
id, "agent experienced an internal error"
);
warn!(error, id, "agent experienced an internal error");
Ok(None)
}
InboundMsgKind::HealthCheck {} => {
@@ -480,7 +476,7 @@ impl Runner {
// gives the outermost cause, and the debug impl
// pretty-prints the error, whereas {:#} contains all the
// causes, but is compact (no newlines).
warn!(error = format_args!("{e:#}"), "error handling message");
warn!(error = format!("{e:#}"), "error handling message");
OutboundMsg::new(
OutboundMsgKind::InternalError {
error: e.to_string(),
@@ -496,7 +492,7 @@ impl Runner {
.context("failed to send message")?;
}
Err(e) => warn!(
error = format_args!("{e:#}"),
error = format!("{e}"),
msg = ?msg,
"received error message"
),

View File

@@ -140,10 +140,6 @@ pub struct PageServerConf {
/// not terrible.
pub background_task_maximum_delay: Duration,
/// If true, use a separate semaphore for compaction tasks instead of the common background task
/// semaphore. Defaults to false.
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<Url>,
/// JWT token for use with the control plane API.
@@ -336,7 +332,6 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_api_token,
control_plane_emergency_mode,
@@ -390,7 +385,6 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_emergency_mode,
heatmap_upload_concurrency,

View File

@@ -8,6 +8,7 @@ use std::time::Duration;
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::virtual_file::MaybeFatalIo;
@@ -462,18 +463,45 @@ impl DeletionQueueClient {
///
/// The `current_generation` is the generation of this pageserver's current attachment. The
/// generations in `layers` are the generations in which those layers were written.
pub(crate) fn push_layers(
pub(crate) async fn push_layers(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
// None generations are not valid for attached tenants: they must always be attached in
// a known generation. None generations are still permitted for layers in the index because
// they may be historical.
assert!(!current_generation.is_none());
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");
let mut layer_paths = Vec::new();
for (layer, meta) in layers {
layer_paths.push(remote_layer_path(
&tenant_shard_id.tenant_id,
&timeline_id,
meta.shard,
&layer,
meta.generation,
));
}
self.push_immediate(layer_paths).await?;
return self.flush_immediate().await;
}
self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
}
/// When a Tenant has a generation, push_layers is always synchronous because
/// the ListValidator channel is an unbounded channel.
///
/// This can be merged into push_layers when we remove the Generation-less mode
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
pub(crate) fn push_layers_sync(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(layers.len() as u64);
@@ -929,12 +957,14 @@ mod test {
// File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
info!("Pushing");
client.push_layers(
tenant_shard_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
)?;
client
.push_layers(
tenant_shard_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
)
.await?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
assert_local_files(&[], &deletion_prefix);
@@ -987,12 +1017,14 @@ mod test {
assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
tracing::debug!("Pushing...");
client.push_layers(
tenant_shard_id,
TIMELINE_ID,
stale_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)?;
client
.push_layers(
tenant_shard_id,
TIMELINE_ID,
stale_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;
// We enqueued the operation in a stale generation: it should have failed validation
tracing::debug!("Flushing...");
@@ -1000,12 +1032,14 @@ mod test {
assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
tracing::debug!("Pushing...");
client.push_layers(
tenant_shard_id,
TIMELINE_ID,
latest_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)?;
client
.push_layers(
tenant_shard_id,
TIMELINE_ID,
latest_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;
// We enqueued the operation in a fresh generation: it should have passed validation
tracing::debug!("Flushing...");
@@ -1040,24 +1074,28 @@ mod test {
// generation gets that treatment)
let remote_layer_file_name_historical =
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
client.push_layers(
tenant_shard_id,
TIMELINE_ID,
now_generation.previous(),
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)?;
client
.push_layers(
tenant_shard_id,
TIMELINE_ID,
now_generation.previous(),
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;
// Inject a deletion in the generation before generation_now: after restart,
// this deletion should get executed, because we execute deletions in the
// immediately previous generation on the same node.
let remote_layer_file_name_previous =
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
client.push_layers(
tenant_shard_id,
TIMELINE_ID,
now_generation,
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
)?;
client
.push_layers(
tenant_shard_id,
TIMELINE_ID,
now_generation,
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
)
.await?;
client.flush().await?;
assert_remote_files(
@@ -1101,7 +1139,6 @@ pub(crate) mod mock {
use tracing::info;
use super::*;
use crate::tenant::remote_timeline_client::remote_layer_path;
use std::sync::atomic::{AtomicUsize, Ordering};
pub struct ConsumerState {

View File

@@ -61,7 +61,6 @@ use crate::{
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
tasks::sleep_random,
},
CancellableTask, DiskUsageEvictionTask,
};
@@ -211,8 +210,14 @@ async fn disk_usage_eviction_task(
info!("disk usage based eviction task finishing");
};
if sleep_random(task_config.period, &cancel).await.is_err() {
return;
use crate::tenant::tasks::random_init_delay;
{
if random_init_delay(task_config.period, &cancel)
.await
.is_err()
{
return;
}
}
let mut iteration_no = 0;

View File

@@ -263,6 +263,14 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
/// data directory at pageserver startup can be automatically removed.
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
/// A marker file to mark that a timeline directory was not fully initialized.
/// If a timeline directory with this marker is encountered at pageserver startup,
/// the timeline directory and the marker file are both removed.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
pub fn is_temporary(path: &Utf8Path) -> bool {
match path.file_name() {
Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
@@ -270,6 +278,25 @@ pub fn is_temporary(path: &Utf8Path) -> bool {
}
}
fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
match path.file_name() {
Some(name) => name.ends_with(suffix),
None => false,
}
}
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
// from the name.
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///

View File

@@ -6,7 +6,7 @@ use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::{Duration, Instant};
use enum_map::{Enum as _, EnumMap};
use enum_map::EnumMap;
use futures::Future;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
@@ -32,7 +32,6 @@ use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext};
use crate::pgdatadir_mapping::DatadirModificationStats;
use crate::task_mgr::TaskKind;
use crate::tenant::layer_map::LayerMap;
use crate::tenant::mgr::TenantSlot;
@@ -104,7 +103,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
.expect("failed to define a metric")
});
// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
// Buckets for background operations like compaction, GC, size calculation
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
@@ -236,7 +235,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
GetVectoredLatency {
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
let task_kind = TaskKind::from_usize(task_kind_idx);
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
let task_kind = task_kind.into();
@@ -259,7 +258,7 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
ScanLatency {
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
let task_kind = TaskKind::from_usize(task_kind_idx);
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
let task_kind = task_kind.into();
@@ -300,10 +299,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
let task_kind = TaskKind::from_usize(task_kind);
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
let task_kind: &'static str = task_kind.into();
EnumMap::from_array(std::array::from_fn(|content_kind| {
let content_kind = PageContentKind::from_usize(content_kind);
let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
let content_kind: &'static str = content_kind.into();
PageCacheMetricsForTaskKind {
read_accesses_immutable: {
@@ -1913,7 +1912,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy
ComputeCommandCounters {
map: EnumMap::from_array(std::array::from_fn(|i| {
let command = ComputeCommandKind::from_usize(i);
let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
let command_str: &'static str = command.into();
inner.with_label_values(&[command_str])
})),
@@ -2213,13 +2212,11 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub struct BackgroundLoopSemaphoreMetrics {
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
durations: EnumMap<BackgroundLoopKind, Histogram>,
waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
durations: EnumMap<BackgroundLoopKind, Counter>,
}
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
Lazy::new(|| {
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|| {
let counters = register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
@@ -2229,101 +2226,45 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics
)
.unwrap();
let durations = register_histogram_vec!(
"pageserver_background_loop_semaphore_wait_seconds",
"Seconds spent waiting on background loop semaphore acquisition",
&["task"],
vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
)
.unwrap();
let waiting_tasks = register_int_gauge_vec!(
"pageserver_background_loop_semaphore_waiting_tasks",
"Number of background loop tasks waiting for semaphore",
&["task"],
)
.unwrap();
let running_tasks = register_int_gauge_vec!(
"pageserver_background_loop_semaphore_running_tasks",
"Number of background loop tasks running concurrently",
let durations = register_counter_vec!(
"pageserver_background_loop_semaphore_wait_duration_seconds",
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
&["task"],
)
.unwrap();
BackgroundLoopSemaphoreMetrics {
counters: EnumMap::from_array(std::array::from_fn(|i| {
let kind = BackgroundLoopKind::from_usize(i);
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
counters.with_label_values(&[kind.into()])
})),
durations: EnumMap::from_array(std::array::from_fn(|i| {
let kind = BackgroundLoopKind::from_usize(i);
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
durations.with_label_values(&[kind.into()])
})),
waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
let kind = BackgroundLoopKind::from_usize(i);
waiting_tasks.with_label_values(&[kind.into()])
})),
running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
let kind = BackgroundLoopKind::from_usize(i);
running_tasks.with_label_values(&[kind.into()])
})),
}
});
},
);
impl BackgroundLoopSemaphoreMetrics {
/// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
/// semaphore is acquired, and drop it when the task completes or is cancelled.
pub(crate) fn record(
&self,
task: BackgroundLoopKind,
) -> BackgroundLoopSemaphoreMetricsRecorder {
BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
}
}
/// Records metrics for a background task.
pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
metrics: &'a BackgroundLoopSemaphoreMetrics,
task: BackgroundLoopKind,
start: Instant,
wait_counter_guard: Option<metrics::IntCounterPairGuard>,
}
impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
/// Starts recording semaphore metrics, by recording wait time and incrementing
/// `wait_start_count` and `waiting_tasks`.
fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
metrics.waiting_tasks[task].inc();
Self {
metrics,
task,
start: Instant::now(),
wait_counter_guard: Some(metrics.counters[task].guard()),
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
struct Record<'a> {
metrics: &'a BackgroundLoopSemaphoreMetrics,
task: BackgroundLoopKind,
_counter_guard: metrics::IntCounterPairGuard,
start: Instant,
}
}
/// Signals that the semaphore has been acquired, and updates relevant metrics.
pub fn acquired(&mut self) -> Duration {
let waited = self.start.elapsed();
self.wait_counter_guard.take().expect("already acquired");
self.metrics.durations[self.task].observe(waited.as_secs_f64());
self.metrics.waiting_tasks[self.task].dec();
self.metrics.running_tasks[self.task].inc();
waited
}
}
impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
/// The task either completed or was cancelled.
fn drop(&mut self) {
if self.wait_counter_guard.take().is_some() {
// Waiting.
self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
self.metrics.waiting_tasks[self.task].dec();
} else {
// Running.
self.metrics.running_tasks[self.task].dec();
impl Drop for Record<'_> {
fn drop(&mut self) {
let elapsed = self.start.elapsed().as_secs_f64();
self.metrics.durations[self.task].inc_by(elapsed);
}
}
Record {
metrics: self,
task,
_counter_guard: self.counters[task].guard(),
start: Instant::now(),
}
}
}
@@ -2437,40 +2378,11 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_observed: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) values_committed_metadata_images: IntCounter,
pub(crate) values_committed_metadata_deltas: IntCounter,
pub(crate) values_committed_data_images: IntCounter,
pub(crate) values_committed_data_deltas: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
}
impl WalIngestMetrics {
pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
if stats.metadata_images > 0 {
self.values_committed_metadata_images
.inc_by(stats.metadata_images);
}
if stats.metadata_deltas > 0 {
self.values_committed_metadata_deltas
.inc_by(stats.metadata_deltas);
}
if stats.data_images > 0 {
self.values_committed_data_images.inc_by(stats.data_images);
}
if stats.data_deltas > 0 {
self.values_committed_data_deltas.inc_by(stats.data_deltas);
}
}
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
let values_committed = register_int_counter_vec!(
"pageserver_wal_ingest_values_committed",
"Number of values committed to pageserver storage from WAL records",
&["class", "kind"],
)
.expect("failed to define a metric");
WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
@@ -2497,15 +2409,17 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
gap_blocks_zeroed_on_rel_extend: register_int_counter!(
"pageserver_gap_blocks_zeroed_on_rel_extend",
"Total number of zero gap blocks written on relation extends"
)
.expect("failed to define a metric"),
clear_vm_bits_unknown: register_int_counter_vec!(
"pageserver_wal_ingest_clear_vm_bits_unknown",
"Number of ignored ClearVmBits operations due to unknown pages/relations",
&["entity"],
)
.expect("failed to define a metric"),
}
});
@@ -2572,7 +2486,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
pub(crate) struct WalRedoProcessCounters {
pub(crate) started: IntCounter,
pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
pub(crate) active_stderr_logger_tasks_started: IntCounter,
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
}
@@ -2614,7 +2528,7 @@ impl Default for WalRedoProcessCounters {
Self {
started,
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
let cause = WalRedoKillCause::from_usize(i);
let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
let cause_str: &'static str = cause.into();
killed.with_label_values(&[cause_str])
})),

View File

@@ -489,6 +489,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
let timeline = tenant_shard
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
set_tracing_field_shard_id(&timeline);
Ok(timeline)
}
}
@@ -773,11 +774,11 @@ impl PageServerHandler {
let batched_msg = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetRelExists,
@@ -792,10 +793,11 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::Nblocks(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetRelSize,
@@ -810,10 +812,11 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::DbSize(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetDbSize,
@@ -828,10 +831,11 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
let timer = record_op_start_and_throttle(
&shard,
metrics::SmgrQueryType::GetSlruSegment,
@@ -846,20 +850,12 @@ impl PageServerHandler {
}
}
PagestreamFeMessage::GetPage(req) => {
// avoid a somewhat costly Span::record() by constructing the entire span in one go.
macro_rules! mkspan {
(before shard routing) => {{
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
}};
($shard_id:expr) => {{
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
}};
}
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
macro_rules! respond_error {
($span:expr, $error:expr) => {{
($error:expr) => {{
let error = BatchedFeMessage::RespondError {
span: $span,
span,
error: BatchedPageStreamError {
req: req.hdr,
err: $error,
@@ -872,35 +868,27 @@ impl PageServerHandler {
let key = rel_block_to_key(req.rel, req.blkno);
let shard = match timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Page(key))
.instrument(span.clone()) // sets `shard_id` field
.await
{
Ok(tl) => tl,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
//
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return respond_error!(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into()
));
}
Err(e) => {
let span = mkspan!(before shard routing);
match e {
GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
//
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return respond_error!(
span,
PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into()
)
);
}
e => {
return respond_error!(span, e.into());
}
}
return respond_error!(e.into());
}
};
let span = mkspan!(shard.tenant_shard_id.shard_slug());
let timer = record_op_start_and_throttle(
&shard,
@@ -922,7 +910,7 @@ impl PageServerHandler {
{
Ok(lsn) => lsn,
Err(e) => {
return respond_error!(span, e);
return respond_error!(e);
}
};
BatchedFeMessage::GetPage {
@@ -934,10 +922,11 @@ impl PageServerHandler {
}
#[cfg(feature = "testing")]
PagestreamFeMessage::Test(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
let timer =
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
.await?;
@@ -1201,29 +1190,6 @@ impl PageServerHandler {
}
};
// We purposefully don't count flush time into the smgr operaiton timer.
//
// The reason is that current compute client will not perform protocol processing
// if the postgres backend process is doing things other than `->smgr_read()`.
// This is especially the case for prefetch.
//
// If the compute doesn't read from the connection, eventually TCP will backpressure
// all the way into our flush call below.
//
// The timer's underlying metric is used for a storage-internal latency SLO and
// we don't want to include latency in it that we can't control.
// And as pointed out above, in this case, we don't control the time that flush will take.
//
// We put each response in the batch onto the wire in a separate pgb_writer.flush()
// call, which (all unmeasured) adds syscall overhead but reduces time to first byte
// and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
// TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
//
// Since we're flushing multiple times in the loop, but only have access to the per-op
// timers inside the loop, we capture the flush start time here and reuse it to finish
// each op timer.
let flushing_start_time = Instant::now();
// Map handler result to protocol behavior.
// Some handler errors cause exit from pagestream protocol.
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
@@ -1272,9 +1238,21 @@ impl PageServerHandler {
&response_msg.serialize(protocol_version),
))?;
// We purposefully don't count flush time into the timer.
//
// The reason is that current compute client will not perform protocol processing
// if the postgres backend process is doing things other than `->smgr_read()`.
// This is especially the case for prefetch.
//
// If the compute doesn't read from the connection, eventually TCP will backpressure
// all the way into our flush call below.
//
// The timer's underlying metric is used for a storage-internal latency SLO and
// we don't want to include latency in it that we can't control.
// And as pointed out above, in this case, we don't control the time that flush will take.
let flushing_timer = timer.map(|mut timer| {
timer
.observe_execution_end_flush_start(flushing_start_time)
.observe_execution_end_flush_start(Instant::now())
.expect("we are the first caller")
});
@@ -1302,6 +1280,8 @@ impl PageServerHandler {
}
Ok(())
}
// and log the info! line inside the request span
.instrument(span.clone())
.await?;
}
Ok(())
@@ -1362,7 +1342,7 @@ impl PageServerHandler {
.take()
.expect("implementation error: timeline_handles should not be locked");
let request_span = info_span!("request");
let request_span = info_span!("request", shard_id = tracing::field::Empty);
let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
PageServicePipeliningConfig::Pipelined(pipelining_config) => {
self.handle_pagerequests_pipelined(
@@ -1712,7 +1692,7 @@ impl PageServerHandler {
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
let gc_info = &timeline.gc_info.read().unwrap();
if !gc_info.lsn_covered_by_lease(request_lsn) {
if !gc_info.leases.contains_key(&request_lsn) {
return Err(
PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
@@ -2056,13 +2036,6 @@ impl PageServerHandler {
.unwrap()
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
set_tracing_field_shard_id(&timeline);
if timeline.is_archived() == Some(true) {
// TODO after a grace period, turn this log line into a hard error
tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
//return Err(QueryError::NotFound("timeline is archived".into()))
}
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {

View File

@@ -48,7 +48,7 @@ use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::{bin_ser::BeSer, lsn::Lsn};
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use wal_decoder::serialized_batch::SerializedValueBatch;
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -612,18 +612,11 @@ impl Timeline {
pausable_failpoint!("find-lsn-for-timestamp-pausable");
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let gc_cutoff_planned = {
let gc_info = self.gc_info.read().unwrap();
gc_info.min_cutoff()
};
// Usually the planned cutoff is newer than the cutoff of the last gc run,
// but let's be defensive.
let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard);
// We use this method to figure out the branching LSN for the new branch, but the
// GC cutoff could be before the branching point and we cannot create a new branch
// with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
// on the safe side.
let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn());
let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
let max_lsn = self.get_last_record_lsn();
// LSNs are always 8-byte aligned. low/mid/high represent the
@@ -1304,26 +1297,6 @@ impl DatadirModification<'_> {
.is_some_and(|b| b.has_data())
}
/// Returns statistics about the currently pending modifications.
pub(crate) fn stats(&self) -> DatadirModificationStats {
let mut stats = DatadirModificationStats::default();
for (_, _, value) in self.pending_metadata_pages.values().flatten() {
match value {
Value::Image(_) => stats.metadata_images += 1,
Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
Value::WalRecord(_) => stats.metadata_deltas += 1,
}
}
for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
match valuemeta {
ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
ValueMeta::Serialized(_) => stats.data_deltas += 1,
ValueMeta::Observed(_) => {}
}
}
stats
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
@@ -2344,15 +2317,6 @@ impl DatadirModification<'_> {
}
}
/// Statistics for a DatadirModification.
#[derive(Default)]
pub struct DatadirModificationStats {
pub metadata_images: u64,
pub metadata_deltas: u64,
pub data_images: u64,
pub data_deltas: u64,
}
/// This struct facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
///

View File

@@ -328,8 +328,8 @@ pub enum TaskKind {
// Eviction. One per timeline.
Eviction,
// Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.).
TenantHousekeeping,
// Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
IngestHousekeeping,
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,

View File

@@ -20,7 +20,6 @@ use chrono::NaiveDateTime;
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use itertools::Itertools as _;
use pageserver_api::models;
use pageserver_api::models::CompactInfoResponse;
use pageserver_api::models::LsnLease;
@@ -47,7 +46,6 @@ use std::sync::atomic::AtomicBool;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::compaction::CompactionOutcome;
use timeline::compaction::GcCompactionQueue;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
@@ -97,6 +95,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::CONCURRENT_INITDBS;
use crate::metrics::INITDB_RUN_TIME;
@@ -1794,7 +1793,11 @@ impl Tenant {
let entry = entry.context("read timeline dir entry")?;
let entry_path = entry.path();
let purge = if crate::is_temporary(entry_path) {
let purge = if crate::is_temporary(entry_path)
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
|| is_uninit_mark(entry_path)
|| crate::is_delete_mark(entry_path)
{
true
} else {
match TimelineId::try_from(entry_path.file_name()) {
@@ -2909,10 +2912,10 @@ impl Tenant {
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<CompactionOutcome, timeline::CompactionError> {
) -> Result<bool, timeline::CompactionError> {
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
if !self.is_active() {
return Ok(CompactionOutcome::Done);
return Ok(false);
}
{
@@ -2926,7 +2929,7 @@ impl Tenant {
// to AttachedSingle state.
if !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(CompactionOutcome::Done);
return Ok(false);
}
}
@@ -2969,7 +2972,7 @@ impl Tenant {
// Before doing any I/O work, check our circuit breaker
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("Skipping compaction due to previous failures");
return Ok(CompactionOutcome::Done);
return Ok(false);
}
let mut has_pending_task = false;
@@ -2977,10 +2980,10 @@ impl Tenant {
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
{
// pending_task_left == None: cannot compact, maybe still pending tasks
// pending_task_left == Some(Pending): compaction task left
// pending_task_left == Some(Done): no compaction task left
// pending_task_left == Some(true): compaction task left
// pending_task_left == Some(false): no compaction task left
let pending_task_left = if *can_compact {
let compaction_outcome = timeline
let has_pending_l0_compaction_task = timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
@@ -2998,27 +3001,27 @@ impl Tenant {
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
}
})?;
if let CompactionOutcome::Pending = compaction_outcome {
Some(CompactionOutcome::Pending)
if has_pending_l0_compaction_task {
Some(true)
} else {
let queue = {
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard.get(timeline_id).cloned()
};
if let Some(queue) = queue {
let outcome = queue
let has_pending_tasks = queue
.iteration(cancel, ctx, &self.gc_block, timeline)
.await?;
Some(outcome)
Some(has_pending_tasks)
} else {
Some(CompactionOutcome::Done)
Some(false)
}
}
} else {
None
};
has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending);
if pending_task_left == Some(CompactionOutcome::Done) && *can_offload {
has_pending_task |= pending_task_left.unwrap_or(false);
if pending_task_left == Some(false) && *can_offload {
pausable_failpoint!("before-timeline-auto-offload");
match offload_timeline(self, timeline)
.instrument(info_span!("offload_timeline", %timeline_id))
@@ -3038,11 +3041,7 @@ impl Tenant {
.unwrap()
.success(&CIRCUIT_BREAKERS_UNBROKEN);
Ok(if has_pending_task {
CompactionOutcome::Pending
} else {
CompactionOutcome::Done
})
Ok(has_pending_task)
}
/// Cancel scheduled compaction tasks
@@ -3089,28 +3088,32 @@ impl Tenant {
Ok(rx)
}
/// Performs periodic housekeeping, via the tenant housekeeping background task.
async fn housekeeping(&self) {
// Call through to all timelines to freeze ephemeral layers as needed. This usually happens
// during ingest, but we don't want idle timelines to hold open layers for too long.
let timelines = self
.timelines
.lock()
.unwrap()
.values()
.filter(|tli| tli.is_active())
.cloned()
.collect_vec();
// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
async fn ingest_housekeeping(&self) {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines = {
self.timelines
.lock()
.unwrap()
.values()
.filter_map(|timeline| {
if timeline.is_active() {
Some(timeline.clone())
} else {
None
}
})
.collect::<Vec<_>>()
};
for timeline in timelines {
for timeline in &timelines {
timeline.maybe_freeze_ephemeral_layer().await;
}
// Shut down walredo if idle.
const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
if let Some(ref walredo_mgr) = self.walredo_mgr {
walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
}
}
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -4639,26 +4642,22 @@ impl Tenant {
// check against last actual 'latest_gc_cutoff' first
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
src_timeline
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn,
))
.map_err(CreateTimelineError::AncestorLsn)?;
// and then the planned GC cutoff
{
let gc_info = src_timeline.gc_info.read().unwrap();
let planned_cutoff = gc_info.min_cutoff();
if gc_info.lsn_covered_by_lease(start_lsn) {
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
} else {
src_timeline
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn,
))
.map_err(CreateTimelineError::AncestorLsn)?;
// and then the planned GC cutoff
if start_lsn < planned_cutoff {
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
)));
}
let cutoff = gc_info.min_cutoff();
if start_lsn < cutoff {
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
)));
}
}
@@ -5492,9 +5491,6 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
image_creation_preempt_threshold: Some(
tenant_conf.image_creation_preempt_threshold,
),
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),

View File

@@ -357,9 +357,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_creation_preempt_threshold: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
@@ -456,9 +453,6 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
image_creation_preempt_threshold: self
.image_creation_preempt_threshold
.unwrap_or(global_conf.image_creation_preempt_threshold),
lsn_lease_length: self
.lsn_lease_length
.unwrap_or(global_conf.lsn_lease_length),
@@ -510,7 +504,6 @@ impl TenantConfOpt {
mut lazy_slru_download,
mut timeline_get_throttle,
mut image_layer_creation_check_threshold,
mut image_creation_preempt_threshold,
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
@@ -585,9 +578,6 @@ impl TenantConfOpt {
patch
.image_layer_creation_check_threshold
.apply(&mut image_layer_creation_check_threshold);
patch
.image_creation_preempt_threshold
.apply(&mut image_creation_preempt_threshold);
patch
.lsn_lease_length
.map(|v| humantime::parse_duration(&v))?
@@ -636,7 +626,6 @@ impl TenantConfOpt {
lazy_slru_download,
timeline_get_throttle,
image_layer_creation_check_threshold,
image_creation_preempt_threshold,
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,
@@ -700,7 +689,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle,
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
image_creation_preempt_threshold: value.image_creation_preempt_threshold,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
timeline_offloading: value.timeline_offloading,

View File

@@ -437,7 +437,8 @@ impl RemoteTimelineClient {
.conf
.remote_storage_config
.as_ref()
.map_or(0, |r| r.concurrency_limit());
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
self.update_remote_physical_size_gauge(Some(index_part));
@@ -460,7 +461,8 @@ impl RemoteTimelineClient {
.conf
.remote_storage_config
.as_ref()
.map_or(0, |r| r.concurrency_limit());
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
self.update_remote_physical_size_gauge(None);
@@ -482,7 +484,8 @@ impl RemoteTimelineClient {
.conf
.remote_storage_config
.as_ref()
.map_or(0, |r| r.concurrency_limit());
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
@@ -517,7 +520,7 @@ impl RemoteTimelineClient {
if let Ok(queue) = queue_locked.initialized_mut() {
let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
for d in blocked_deletions {
if let Err(e) = self.deletion_queue_client.push_layers(
if let Err(e) = self.deletion_queue_client.push_layers_sync(
self.tenant_shard_id,
self.timeline_id,
self.generation,
@@ -2151,6 +2154,7 @@ impl RemoteTimelineClient {
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e))
}
}

View File

@@ -9,14 +9,13 @@ use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode,
mgr::{GetTenantError, TenantManager},
mgr::GetTenantError,
mgr::TenantManager,
remote_timeline_client::remote_heatmap_path,
span::debug_assert_current_span_has_tenant_id,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
Tenant,
},
virtual_file::VirtualFile,
TEMP_FILE_SUFFIX,
};
use futures::Future;
@@ -33,10 +32,7 @@ use super::{
};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension,
yielding_loop::yielding_loop,
};
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
@@ -465,18 +461,6 @@ async fn upload_tenant_heatmap(
}
}
// After a successful upload persist the fresh heatmap to disk.
// When restarting, the tenant will read the heatmap from disk
// and additively generate a new heatmap (see [`Timeline::generate_heatmap`]).
// If the heatmap is stale, the additive generation can lead to keeping previously
// evicted timelines on the secondarie's disk.
let tenant_shard_id = tenant.get_tenant_shard_id();
let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id);
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await {
tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}");
}
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {

View File

@@ -166,10 +166,6 @@ impl BatchLayerWriter {
// END: catch every error and do the recovery in the above section
Ok(generated_layers)
}
pub fn pending_layer_num(&self) -> usize {
self.generated_layer_writers.len()
}
}
/// An image writer that takes images and produces multiple image layers.

View File

@@ -1,80 +1,52 @@
//! This module contains per-tenant background processes, e.g. compaction and GC.
//! This module contains functions to serve per-tenant background processes,
//! such as compaction and GC
use std::cmp::max;
use std::future::Future;
use std::ops::{ControlFlow, RangeInclusive};
use std::pin::pin;
use std::sync::{Arc, Mutex};
use std::ops::ControlFlow;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};
use once_cell::sync::Lazy;
use rand::Rng;
use scopeguard::defer;
use tokio::sync::{Semaphore, SemaphorePermit};
use tokio_util::sync::CancellationToken;
use tracing::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::compaction::CompactionOutcome;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
use utils::completion::Barrier;
use utils::rate_limit::RateLimit;
use utils::{backoff, pausable_failpoint};
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion, pausable_failpoint};
/// Semaphore limiting concurrent background tasks (across all tenants).
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(permits < total_threads, "need threads for other work");
Semaphore::new(permits)
});
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should give leave us some workers
// unblocked to be blocked on other work, hopefully easing any outside visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by
/// default, see `use_compaction_semaphore`.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
///
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
/// to avoid high read amp during heavy write workloads.
///
/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks.
/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction.
static CONCURRENT_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(permits < total_threads, "need threads for other work");
Semaphore::new(permits)
});
/// Background jobs.
///
/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
/// do any significant IO.
#[derive(
Debug,
PartialEq,
Eq,
Clone,
Copy,
strum_macros::IntoStaticStr,
strum_macros::Display,
enum_map::Enum,
)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
Compaction,
Gc,
Eviction,
TenantHouseKeeping,
IngestHouseKeeping,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
@@ -82,56 +54,36 @@ pub(crate) enum BackgroundLoopKind {
SecondaryDownload,
}
pub struct BackgroundLoopSemaphorePermit<'a> {
_permit: SemaphorePermit<'static>,
_recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
impl BackgroundLoopKind {
fn as_static_str(&self) -> &'static str {
self.into()
}
}
/// Acquires a semaphore permit, to limit concurrent background jobs.
pub(crate) async fn acquire_concurrency_permit(
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
use_compaction_semaphore: bool,
_ctx: &RequestContext,
) -> BackgroundLoopSemaphorePermit<'static> {
// TODO: use a lower threshold and remove the pacer once we resolve some blockage.
const WARN_THRESHOLD: Duration = Duration::from_secs(600);
static WARN_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
) -> tokio::sync::SemaphorePermit<'static> {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
pausable_failpoint!("initial-size-calculation-permit-pause");
}
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore {
CONCURRENT_COMPACTION_TASKS.acquire().await
} else {
assert!(!use_compaction_semaphore);
CONCURRENT_BACKGROUND_TASKS.acquire().await
}
.expect("should never close");
let waited = recorder.acquired();
if waited >= WARN_THRESHOLD {
let waited = waited.as_secs_f64();
WARN_PACER
.lock()
.unwrap()
.call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
}
BackgroundLoopSemaphorePermit {
_permit: permit,
_recorder: recorder,
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),
}
}
/// Start per tenant background loops: compaction, GC, and ingest housekeeping.
pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
/// Start per tenant background loops: compaction and gc.
pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_shard_id = tenant.tenant_shard_id;
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
@@ -140,15 +92,13 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
&format!("compactor for tenant {tenant_shard_id}"),
{
let tenant = Arc::clone(tenant);
let can_start = can_start.cloned();
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => return Ok(()),
_ = Barrier::maybe_wait(can_start) => {}
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
compaction_loop(tenant, cancel)
// If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
.instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -157,7 +107,6 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
}
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::GarbageCollector,
@@ -166,15 +115,13 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
&format!("garbage collector for tenant {tenant_shard_id}"),
{
let tenant = Arc::clone(tenant);
let can_start = can_start.cloned();
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => return Ok(()),
_ = Barrier::maybe_wait(can_start) => {}
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
@@ -185,23 +132,21 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::TenantHousekeeping,
TaskKind::IngestHousekeeping,
tenant_shard_id,
None,
&format!("housekeeping for tenant {tenant_shard_id}"),
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
{
let tenant = Arc::clone(tenant);
let can_start = can_start.cloned();
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => return Ok(()),
_ = Barrier::maybe_wait(can_start) => {}
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
tenant_housekeeping_loop(tenant, cancel)
.instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
ingest_housekeeping_loop(tenant, cancel)
.instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
Ok(())
}
@@ -209,293 +154,372 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
);
}
/// Compaction task's main loop.
///
/// Compaction task's main loop
///
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
const MAX_BACKOFF_SECS: f64 = 300.0;
// How many errors we have seen consequtively
let mut error_run_count = 0;
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
let mut error_run = 0; // consecutive errors
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
loop {
tokio::select! {
_ = cancel.cancelled() => {
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
},
}
loop {
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
return;
}
let period = tenant.get_compaction_period();
let period = tenant.get_compaction_period();
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
}
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first {
first = false;
if sleep_random(period, &cancel).await.is_err() {
let sleep_duration;
if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10)
} else {
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::Compaction,
};
// Run compaction
let IterationResult { output, elapsed } = iteration
.run(tenant.compaction_iteration(&cancel, &ctx))
.await;
match output {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if has_pending_task {
Duration::ZERO
} else {
period
};
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
log_compaction_error(
&e,
error_run_count,
&wait_duration,
cancel.is_cancelled(),
);
sleep_duration = wait_duration;
}
}
// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(
elapsed_ms = elapsed.as_millis(),
"compaction iteration complete"
);
};
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
if let Some(walredo_mgr) = &tenant.walredo_mgr {
walredo_mgr.maybe_quiesce(period * 10);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
{
break;
}
}
let sleep_duration;
if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10)
} else {
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::Compaction,
};
// Run compaction
let IterationResult { output, elapsed } = iteration
.run(tenant.compaction_iteration(&cancel, &ctx))
.await;
match output {
Ok(outcome) => {
error_run = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if let CompactionOutcome::Pending = outcome {
Duration::from_secs(1)
} else {
period
};
}
Err(err) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
log_compaction_error(&err, error_run, &wait_duration, cancel.is_cancelled());
sleep_duration = wait_duration;
}
}
// the duration is recorded by performance tests by enabling debug in this function
debug!(
elapsed_ms = elapsed.as_millis(),
"compaction iteration complete"
);
};
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
{
break;
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
fn log_compaction_error(
err: &CompactionError,
error_count: u32,
sleep_duration: &Duration,
e: &CompactionError,
error_run_count: u32,
sleep_duration: &std::time::Duration,
task_cancelled: bool,
) {
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;
let level = match err {
ShuttingDown => return,
Offload(_) => Level::ERROR,
_ if task_cancelled => Level::INFO,
Other(err) => {
let root_cause = err.root_cause();
enum LooksLike {
Info,
Error,
}
let upload_queue = root_cause
.downcast_ref::<NotInitialized>()
.is_some_and(|e| e.is_stopping());
let timeline = root_cause
.downcast_ref::<PageReconstructError>()
.is_some_and(|e| e.is_stopping());
let is_stopping = upload_queue || timeline;
let decision = match e {
ShuttingDown => None,
Offload(_) => Some(LooksLike::Error),
_ if task_cancelled => Some(LooksLike::Info),
Other(e) => {
let root_cause = e.root_cause();
let is_stopping = {
let upload_queue = root_cause
.downcast_ref::<NotInitialized>()
.is_some_and(|e| e.is_stopping());
let timeline = root_cause
.downcast_ref::<PageReconstructError>()
.is_some_and(|e| e.is_stopping());
upload_queue || timeline
};
if is_stopping {
Level::INFO
Some(LooksLike::Info)
} else {
Level::ERROR
Some(LooksLike::Error)
}
}
};
match level {
Level::ERROR => {
error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
}
Level::INFO => {
info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
}
level => unimplemented!("unexpected level {level:?}"),
match decision {
Some(LooksLike::Info) => info!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
),
Some(LooksLike::Error) => error!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
),
None => {}
}
}
/// GC task's main loop.
///
/// GC task's main loop
///
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
const MAX_BACKOFF_SECS: f64 = 300.0;
let mut error_run = 0; // consecutive errors
// How many errors we have seen consequtively
let mut error_run_count = 0;
// GC might require downloading, to find the cutoff LSN that corresponds to the
// cutoff specified as time.
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
// GC might require downloading, to find the cutoff LSN that corresponds to the
// cutoff specified as time.
let ctx =
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
loop {
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
return;
}
let mut first = true;
loop {
tokio::select! {
_ = cancel.cancelled() => {
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
},
}
let period = tenant.get_gc_period();
let period = tenant.get_gc_period();
if first {
first = false;
if sleep_random(period, &cancel).await.is_err() {
if first {
first = false;
let delays = async {
random_init_delay(period, &cancel).await?;
Ok::<_, Cancelled>(())
};
if delays.await.is_err() {
break;
}
}
let gc_horizon = tenant.get_gc_horizon();
let sleep_duration;
if period == Duration::ZERO || gc_horizon == 0 {
#[cfg(not(feature = "testing"))]
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10);
} else {
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::Gc,
};
// Run gc
let IterationResult { output, elapsed: _ } =
iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx))
.await;
match output {
Ok(_) => {
error_run_count = 0;
sleep_duration = period;
}
Err(crate::tenant::GcError::TenantCancelled) => {
return;
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
// Timeline was cancelled during gc. We might either be in an event
// that affects the entire tenant (tenant deletion, pageserver shutdown),
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}
sleep_duration = wait_duration;
}
}
};
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
{
break;
}
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
tokio::select! {
_ = cancel.cancelled() => {
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
},
}
// We run ingest housekeeping with the same frequency as compaction: it is not worth
// having a distinct setting. But we don't run it in the same task, because compaction
// blocks on acquiring the background job semaphore.
let period = tenant.get_compaction_period();
// If compaction period is set to zero (to disable it), then we will use a reasonable default
let period = if period == Duration::ZERO {
humantime::Duration::from_str(
pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
)
.unwrap()
.into()
} else {
period
};
// Jitter the period by +/- 5%
let period =
rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);
// Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
// a tenant, since it won't have started writing any ephemeral files yet.
if tokio::time::timeout(period, cancel.cancelled())
.await
.is_ok()
{
break;
}
let gc_horizon = tenant.get_gc_horizon();
let sleep_duration;
if period == Duration::ZERO || gc_horizon == 0 {
#[cfg(not(feature = "testing"))]
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10);
} else {
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::Gc,
kind: BackgroundLoopKind::IngestHouseKeeping,
};
// Run gc
let IterationResult { output, elapsed: _ } = iteration
.run(tenant.gc_iteration(
None,
gc_horizon,
tenant.get_pitr_interval(),
&cancel,
&ctx,
))
.await;
match output {
Ok(_) => {
error_run = 0;
sleep_duration = period;
}
Err(crate::tenant::GcError::TenantCancelled) => {
iteration.run(tenant.ingest_housekeeping()).await;
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
// Or just spawn another background loop for this throttle, it's not like it's super costly.
info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
if count_throttled == 0 {
return;
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
let allowed_rps = tenant.pagestream_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,
count_accounted_start, // log after pre-existing fields to not break existing log scraping
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
// Timeline was cancelled during gc. We might either be in an event
// that affects the entire tenant (tenant deletion, pageserver shutdown),
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
// if the tenant has a proper status already, no need to wait for anything
if tenant.current_state() == TenantState::Active {
ControlFlow::Continue(())
} else {
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
loop {
match tenant_state_updates.changed().await {
Ok(()) => {
let new_state = &*tenant_state_updates.borrow();
match new_state {
TenantState::Active => {
debug!("Tenant state changed to active, continuing the task loop");
return ControlFlow::Continue(());
}
state => {
debug!("Not running the task loop, tenant is not active: {state:?}");
continue;
}
}
sleep_duration = wait_duration;
}
Err(_sender_dropped_error) => {
return ControlFlow::Break(());
}
}
};
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
{
break;
}
}
}
/// Tenant housekeeping's main loop.
async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
return;
}
// Use the same period as compaction; it's not worth a separate setting. But if it's set to
// zero (to disable compaction), then use a reasonable default. Jitter it by 5%.
let period = match tenant.get_compaction_period() {
Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(),
period => period,
};
let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else {
break;
};
// Do tenant housekeeping.
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::TenantHouseKeeping,
};
iteration.run(tenant.housekeeping()).await;
// Log any getpage throttling.
info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.pagestream_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,
count_accounted_start, // log after pre-existing fields to not break existing log scraping
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
}
}
/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
async fn wait_for_active_tenant(
tenant: &Arc<Tenant>,
cancel: &CancellationToken,
) -> ControlFlow<()> {
if tenant.current_state() == TenantState::Active {
return ControlFlow::Continue(());
}
let mut update_rx = tenant.subscribe_for_state_updates();
loop {
tokio::select! {
_ = cancel.cancelled() => return ControlFlow::Break(()),
result = update_rx.changed() => if result.is_err() {
return ControlFlow::Break(());
}
}
match &*update_rx.borrow() {
TenantState::Active => {
debug!("Tenant state changed to active, continuing the task loop");
return ControlFlow::Continue(());
}
state => debug!("Not running the task loop, tenant is not active: {state:?}"),
}
}
}
@@ -504,41 +528,26 @@ async fn wait_for_active_tenant(
#[error("cancelled")]
pub(crate) struct Cancelled;
/// Sleeps for a random interval up to the given max value.
/// Provide a random delay for background task initialization.
///
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
/// different periods for more stable load.
pub(crate) async fn sleep_random(
max: Duration,
pub(crate) async fn random_init_delay(
period: Duration,
cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
sleep_random_range(Duration::ZERO..=max, cancel).await
}
/// Sleeps for a random interval in the given range. Returns the duration.
pub(crate) async fn sleep_random_range(
interval: RangeInclusive<Duration>,
cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
let delay = rand::thread_rng().gen_range(interval);
if delay == Duration::ZERO {
return Ok(delay);
) -> Result<(), Cancelled> {
if period == Duration::ZERO {
return Ok(());
}
tokio::select! {
_ = cancel.cancelled() => Err(Cancelled),
_ = tokio::time::sleep(delay) => Ok(delay),
}
}
/// Sleeps for an interval with a random jitter.
pub(crate) async fn sleep_jitter(
duration: Duration,
jitter: Duration,
cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
let from = duration.saturating_sub(jitter);
let to = duration.saturating_add(jitter);
sleep_random_range(from..=to, cancel).await
let d = {
let mut rng = rand::thread_rng();
rng.gen_range(Duration::ZERO..=period)
};
match tokio::time::timeout(d, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
}
}
struct Iteration {
@@ -554,25 +563,42 @@ struct IterationResult<O> {
impl Iteration {
#[instrument(skip_all)]
pub(crate) async fn run<F: Future<Output = O>, O>(self, fut: F) -> IterationResult<O> {
let mut fut = pin!(fut);
pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
where
Fut: std::future::Future<Output = O>,
{
let Self {
started_at,
period,
kind,
} = self;
let mut fut = std::pin::pin!(fut);
// Wrap `fut` into a future that logs a message every `period` so that we get a
// very obvious breadcrumb in the logs _while_ a slow iteration is happening.
let output = loop {
match tokio::time::timeout(self.period, &mut fut).await {
Ok(r) => break r,
Err(_) => info!("still running"),
let liveness_logger = async move {
loop {
match tokio::time::timeout(period, &mut fut).await {
Ok(x) => return x,
Err(_) => {
// info level as per the same rationale why warn_when_period_overrun is info
// => https://github.com/neondatabase/neon/pull/5724
info!("still running");
}
}
}
};
let elapsed = self.started_at.elapsed();
warn_when_period_overrun(elapsed, self.period, self.kind);
let output = liveness_logger.await;
let elapsed = started_at.elapsed();
warn_when_period_overrun(elapsed, period, kind);
IterationResult { output, elapsed }
}
}
// NB: the `task` and `period` are used for metrics labels.
/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
pub(crate) fn warn_when_period_overrun(
elapsed: Duration,
period: Duration,
@@ -590,7 +616,7 @@ pub(crate) fn warn_when_period_overrun(
"task iteration took longer than the configured period"
);
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
.with_label_values(&[task.into(), &format!("{}", period.as_secs())])
.with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
.inc();
}
}

View File

@@ -18,7 +18,6 @@ use arc_swap::{ArcSwap, ArcSwapOption};
use bytes::Bytes;
use camino::Utf8Path;
use chrono::{DateTime, Utc};
use compaction::CompactionOutcome;
use enumset::EnumSet;
use fail::fail_point;
use futures::{stream::FuturesUnordered, StreamExt};
@@ -52,7 +51,6 @@ use tokio::{
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::critical;
use utils::rate_limit::RateLimit;
use utils::{
fs_ext,
@@ -191,19 +189,6 @@ pub enum ImageLayerCreationMode {
Initial,
}
#[derive(Clone, Debug, Default)]
pub enum LastImageLayerCreationStatus {
Incomplete {
/// The last key of the partition (exclusive) that was processed in the last
/// image layer creation attempt. We will continue from this key in the next
/// attempt.
last_key: Key,
},
Complete,
#[default]
Initial,
}
impl std::fmt::Display for ImageLayerCreationMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
@@ -362,8 +347,6 @@ pub struct Timeline {
// garbage collecting data that is still needed by the child timelines.
pub(crate) gc_info: std::sync::RwLock<GcInfo>,
pub(crate) last_image_layer_creation_status: ArcSwap<LastImageLayerCreationStatus>,
// It may change across major versions so for simplicity
// keep it after running initdb for a timeline.
// It is needed in checks when we want to error on some operations
@@ -532,9 +515,6 @@ impl GcInfo {
pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool {
self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes)
}
pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool {
self.leases.contains_key(&lsn)
}
}
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
@@ -956,16 +936,9 @@ pub(crate) enum ShutdownMode {
Hard,
}
enum ImageLayerCreationOutcome {
/// We generated an image layer
Generated {
unfinished_image_layer: ImageLayerWriter,
},
/// The key range is empty
Empty,
/// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip
/// the image layer creation.
Skip,
struct ImageLayerCreationOutcome {
unfinished_image_layer: Option<ImageLayerWriter>,
next_start_key: Key,
}
/// Public interface functions
@@ -1689,7 +1662,7 @@ impl Timeline {
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
) -> Result<bool, CompactionError> {
self.compact_with_options(
cancel,
CompactOptions {
@@ -1711,16 +1684,15 @@ impl Timeline {
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
) -> Result<bool, CompactionError> {
// most likely the cancellation token is from background task, but in tests it could be the
// request task as well.
let prepare = async move {
let guard = self.compaction_lock.lock().await;
let permit = super::tasks::acquire_concurrency_permit(
let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Compaction,
self.conf.use_compaction_semaphore,
ctx,
)
.await;
@@ -1732,8 +1704,8 @@ impl Timeline {
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
tuple = prepare => { tuple },
_ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
_ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
_ = self.cancel.cancelled() => return Ok(false),
_ = cancel.cancelled() => return Ok(false),
};
let last_record_lsn = self.get_last_record_lsn();
@@ -1741,13 +1713,13 @@ impl Timeline {
// Last record Lsn could be zero in case the timeline was just created
if !last_record_lsn.is_valid() {
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
return Ok(CompactionOutcome::Done);
return Ok(false);
}
let result = match self.get_compaction_algorithm_settings().kind {
CompactionAlgorithm::Tiered => {
self.compact_tiered(cancel, ctx).await?;
Ok(CompactionOutcome::Done)
Ok(false)
}
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await,
};
@@ -2377,18 +2349,6 @@ impl Timeline {
)
}
fn get_image_creation_preempt_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.image_creation_preempt_threshold
.unwrap_or(
self.conf
.default_tenant_conf
.image_creation_preempt_threshold,
)
}
/// Resolve the effective WAL receiver protocol to use for this tenant.
///
/// Priority order is:
@@ -2539,10 +2499,6 @@ impl Timeline {
gc_info: std::sync::RwLock::new(GcInfo::default()),
last_image_layer_creation_status: ArcSwap::new(Arc::new(
LastImageLayerCreationStatus::default(),
)),
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
@@ -2633,7 +2589,7 @@ impl Timeline {
return;
}
FlushLoopState::Exited => {
info!(
warn!(
"ignoring attempt to restart exited flush_loop {}/{}",
self.tenant_shard_id, self.timeline_id
);
@@ -3057,9 +3013,8 @@ impl Timeline {
let self_ref = &self;
let skip_concurrency_limiter = &skip_concurrency_limiter;
async move {
let wait_for_permit = super::tasks::acquire_concurrency_permit(
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
false,
background_ctx,
);
@@ -4087,20 +4042,15 @@ impl Timeline {
}
let mut layers_to_upload = Vec::new();
let (generated_image_layers, is_complete) = self
.create_image_layers(
layers_to_upload.extend(
self.create_image_layers(
&partitions,
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
LastImageLayerCreationStatus::Initial,
)
.await?;
debug_assert!(
matches!(is_complete, LastImageLayerCreationStatus::Complete),
"init image generation mode must fully cover the keyspace"
.await?,
);
layers_to_upload.extend(generated_image_layers);
(layers_to_upload, None)
} else {
@@ -4357,7 +4307,7 @@ impl Timeline {
Ok(result)
}
// Is it time to create a new image layer for the given partition? True if we want to generate.
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let threshold = self.get_image_creation_threshold();
@@ -4420,6 +4370,7 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
img_range: Range<Key>,
start: Key,
io_concurrency: IoConcurrency,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
let mut wrote_keys = false;
@@ -4507,23 +4458,26 @@ impl Timeline {
lsn
},
);
Ok(ImageLayerCreationOutcome::Generated {
unfinished_image_layer: image_layer_writer,
Ok(ImageLayerCreationOutcome {
unfinished_image_layer: Some(image_layer_writer),
next_start_key: img_range.end,
})
} else {
// Special case: the image layer may be empty if this is a sharded tenant and the
// partition does not cover any keys owned by this shard. In this case, to ensure
// we don't leave gaps between image layers, leave `start` where it is, so that the next
// layer we write will cover the key range that we just scanned.
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
Ok(ImageLayerCreationOutcome::Empty)
Ok(ImageLayerCreationOutcome {
unfinished_image_layer: None,
next_start_key: start,
})
}
}
/// Create an image layer for metadata keys. This function produces one image layer for all metadata
/// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
/// would not be too large to fit in a single image layer.
///
/// Creating image layers for metadata keys are different from relational keys. Firstly, instead of
/// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse
/// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics
/// for metadata keys than relational keys, which is the number of delta files visited during the scan.
#[allow(clippy::too_many_arguments)]
async fn create_image_layer_for_metadata_keys(
self: &Arc<Self>,
@@ -4533,13 +4487,12 @@ impl Timeline {
ctx: &RequestContext,
img_range: Range<Key>,
mode: ImageLayerCreationMode,
start: Key,
io_concurrency: IoConcurrency,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
// Metadata keys image layer creation.
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
let begin = Instant::now();
// Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
// not contain too many keys, otherwise this takes a lot of memory.
let data = self
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
@@ -4564,7 +4517,10 @@ impl Timeline {
);
if !trigger_generation && mode == ImageLayerCreationMode::Try {
return Ok(ImageLayerCreationOutcome::Skip);
return Ok(ImageLayerCreationOutcome {
unfinished_image_layer: None,
next_start_key: img_range.end,
});
}
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
@@ -4595,12 +4551,20 @@ impl Timeline {
lsn
}
);
Ok(ImageLayerCreationOutcome::Generated {
unfinished_image_layer: image_layer_writer,
Ok(ImageLayerCreationOutcome {
unfinished_image_layer: Some(image_layer_writer),
next_start_key: img_range.end,
})
} else {
// Special case: the image layer may be empty if this is a sharded tenant and the
// partition does not cover any keys owned by this shard. In this case, to ensure
// we don't leave gaps between image layers, leave `start` where it is, so that the next
// layer we write will cover the key range that we just scanned.
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
Ok(ImageLayerCreationOutcome::Empty)
Ok(ImageLayerCreationOutcome {
unfinished_image_layer: None,
next_start_key: start,
})
}
}
@@ -4656,8 +4620,6 @@ impl Timeline {
decision
}
/// Returns the image layers generated and an enum indicating whether the process is fully completed.
/// true = we have generate all image layers, false = we preempt the process for L0 compaction.
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
async fn create_image_layers(
self: &Arc<Timeline>,
@@ -4665,15 +4627,9 @@ impl Timeline {
lsn: Lsn,
mode: ImageLayerCreationMode,
ctx: &RequestContext,
last_status: LastImageLayerCreationStatus,
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
if partitioning.parts.is_empty() {
warn!("no partitions to create image layers for");
return Ok((vec![], LastImageLayerCreationStatus::Complete));
}
// We need to avoid holes between generated image layers.
// Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one
// image layer with hole between them. In this case such layer can not be utilized by GC.
@@ -4685,65 +4641,15 @@ impl Timeline {
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
let mut start = Key::MIN;
let check_for_image_layers =
if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
info!(
"resuming image layer creation: last_status=incomplete, continue from {}",
last_key
);
true
} else {
self.should_check_if_image_layers_required(lsn)
};
let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
let mut all_generated = true;
let mut partition_processed = 0;
let mut total_partitions = partitioning.parts.len();
let mut last_partition_processed = None;
let mut partition_parts = partitioning.parts.clone();
if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
// We need to skip the partitions that have already been processed.
let mut found = false;
for (i, partition) in partition_parts.iter().enumerate() {
if last_key <= partition.end().unwrap() {
// ```plain
// |------|--------|----------|------|
// ^last_key
// ^start from this partition
// ```
// Why `i+1` instead of `i`?
// It is possible that the user did some writes after the previous image layer creation attempt so that
// a relation grows in size, and the last_key is now in the middle of the partition. In this case, we
// still want to skip this partition, so that we can make progress and avoid generating image layers over
// the same partition. Doing a mod to ensure we don't end up with an empty vec.
if i + 1 >= total_partitions {
// In general, this case should not happen -- if last_key is on the last partition, the previous
// iteration of image layer creation should return a complete status.
break; // with found=false
}
partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
total_partitions = partition_parts.len();
// Update the start key to the partition start.
start = partition_parts[0].start().unwrap();
found = true;
break;
}
}
if !found {
// Last key is within the last partition, or larger than all partitions.
return Ok((vec![], LastImageLayerCreationStatus::Complete));
}
}
for partition in partition_parts.iter() {
for partition in partitioning.parts.iter() {
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
partition_processed += 1;
let img_range = start..partition.ranges.last().unwrap().end;
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
if compact_metadata {
@@ -4778,8 +4684,6 @@ impl Timeline {
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
is_delta: false,
}) {
// TODO: this can be processed with the BatchLayerWriter::finish_with_discard
// in the future.
tracing::info!(
"Skipping image layer at {lsn} {}..{}, already exists",
img_range.start,
@@ -4813,13 +4717,17 @@ impl Timeline {
.map_err(|_| CreateImageLayersError::Cancelled)?,
);
let outcome = if !compact_metadata {
let ImageLayerCreationOutcome {
unfinished_image_layer,
next_start_key,
} = if !compact_metadata {
self.create_image_layer_for_rel_blocks(
partition,
image_layer_writer,
lsn,
ctx,
img_range.clone(),
start,
io_concurrency,
)
.await?
@@ -4831,58 +4739,18 @@ impl Timeline {
ctx,
img_range.clone(),
mode,
start,
io_concurrency,
)
.await?
};
match outcome {
ImageLayerCreationOutcome::Empty => {
// No data in this partition, so we don't need to create an image layer (for now).
// The next image layer should cover this key range, so we don't advance the `start`
// key.
}
ImageLayerCreationOutcome::Generated {
start = next_start_key;
if let Some(unfinished_image_layer) = unfinished_image_layer {
batch_image_writer.add_unfinished_image_writer(
unfinished_image_layer,
} => {
batch_image_writer.add_unfinished_image_writer(
unfinished_image_layer,
img_range.clone(),
lsn,
);
// The next image layer should be generated right after this one.
start = img_range.end;
}
ImageLayerCreationOutcome::Skip => {
// We don't need to create an image layer for this partition.
// The next image layer should NOT cover this range, otherwise
// the keyspace becomes empty (reads don't go past image layers).
start = img_range.end;
}
}
if let ImageLayerCreationMode::Try = mode {
// We have at least made some progress
if batch_image_writer.pending_layer_num() >= 1 {
// The `Try` mode is currently only used on the compaction path. We want to avoid
// image layer generation taking too long time and blocking L0 compaction. So in this
// mode, we also inspect the current number of L0 layers and skip image layer generation
// if there are too many of them.
let num_of_l0_layers = {
let layers = self.layers.read().await;
layers.layer_map()?.level0_deltas().len()
};
let image_preempt_threshold = self.get_image_creation_preempt_threshold()
* self.get_compaction_threshold();
if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
tracing::info!(
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
);
last_partition_processed = Some(partition.clone());
all_generated = false;
break;
}
}
img_range,
lsn,
);
}
}
@@ -4897,42 +4765,14 @@ impl Timeline {
.open_mut()?
.track_new_image_layers(&image_layers, &self.metrics);
drop_wlock(guard);
let duration = timer.stop_and_record();
timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
if !image_layers.is_empty() {
self.update_layer_visibility().await?;
}
let total_layer_size = image_layers
.iter()
.map(|l| l.metadata().file_size)
.sum::<u64>();
info!(
"created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
image_layers.len(),
total_layer_size,
duration.as_secs_f64(),
partition_processed,
total_partitions
);
Ok((
image_layers,
if all_generated {
LastImageLayerCreationStatus::Complete
} else {
LastImageLayerCreationStatus::Incomplete {
last_key: if let Some(last_partition_processed) = last_partition_processed {
last_partition_processed.end().unwrap_or(Key::MIN)
} else {
// This branch should be unreachable, but in case it happens, we can just return the start key.
Key::MIN
},
}
},
))
Ok(image_layers)
}
/// Wait until the background initial logical size calculation is complete, or
@@ -5810,11 +5650,10 @@ impl Timeline {
let img = match res {
Ok(img) => img,
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(err)) => {
critical!("walredo failure during page reconstruction: {err:?}");
Err(walredo::Error::Other(e)) => {
return Err(PageReconstructError::WalRedo(
err.context("reconstruct a page image"),
));
e.context("reconstruct a page image"),
))
}
};
Ok(img)

View File

@@ -10,8 +10,8 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
RecordedDuration, Timeline,
};
use anyhow::{anyhow, bail, Context};
@@ -26,7 +26,6 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::critical;
use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
@@ -34,7 +33,6 @@ use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
use crate::tenant::layer_map::LayerMap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -264,13 +262,13 @@ impl GcCompactionQueue {
ctx: &RequestContext,
gc_block: &GcBlock,
timeline: &Arc<Timeline>,
) -> Result<CompactionOutcome, CompactionError> {
) -> Result<bool, CompactionError> {
let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
let has_pending_tasks;
let (id, item) = {
let mut guard = self.inner.lock().unwrap();
let Some((id, item)) = guard.queued.pop_front() else {
return Ok(CompactionOutcome::Done);
return Ok(false);
};
guard.running = Some((id, item.clone()));
has_pending_tasks = !guard.queued.is_empty();
@@ -325,11 +323,7 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.running = None;
}
Ok(if has_pending_tasks {
CompactionOutcome::Pending
} else {
CompactionOutcome::Done
})
Ok(has_pending_tasks)
}
#[allow(clippy::type_complexity)]
@@ -440,11 +434,6 @@ impl KeyHistoryRetention {
if dry_run {
return true;
}
if LayerMap::is_l0(&key.key_range, key.is_delta) {
// gc-compaction should not produce L0 deltas, otherwise it will break the layer order.
// We should ignore such layers.
return true;
}
let layer_generation;
{
let guard = tline.layers.read().await;
@@ -600,17 +589,6 @@ impl CompactionStatistics {
}
}
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompactionOutcome {
#[default]
/// No layers need to be compacted after this round. Compaction doesn't need
/// to be immediately scheduled.
Done,
/// Still has pending layers to be compacted after this round. Ideally, the scheduler
/// should immediately schedule another compaction.
Pending,
}
impl Timeline {
/// TODO: cancellation
///
@@ -620,7 +598,7 @@ impl Timeline {
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
) -> Result<bool, CompactionError> {
if options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
@@ -628,7 +606,7 @@ impl Timeline {
self.compact_with_gc(cancel, options, ctx)
.await
.map_err(CompactionError::Other)?;
return Ok(CompactionOutcome::Done);
return Ok(false);
}
if options.flags.contains(CompactFlags::DryRun) {
@@ -688,9 +666,9 @@ impl Timeline {
// Define partitioning schema if needed
// 1. L0 Compact
let l0_compaction_outcome = {
let fully_compacted = {
let timer = self.metrics.compact_time_histo.start_timer();
let l0_compaction_outcome = self
let fully_compacted = self
.compact_level0(
target_file_size,
options.flags.contains(CompactFlags::ForceL0Compaction),
@@ -698,15 +676,15 @@ impl Timeline {
)
.await?;
timer.stop_and_record();
l0_compaction_outcome
fully_compacted
};
if let CompactionOutcome::Pending = l0_compaction_outcome {
if !fully_compacted {
// Yield and do not do any other kind of compaction. True means
// that we have pending L0 compaction tasks and the compaction scheduler
// will prioritize compacting this tenant/timeline again.
info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
return Ok(CompactionOutcome::Pending);
return Ok(true);
}
// 2. Repartition and create image layers if necessary
@@ -731,7 +709,7 @@ impl Timeline {
.extend(sparse_partitioning.into_dense().parts);
// 3. Create new image layers for partitions that have been modified "enough".
let (image_layers, outcome) = self
let image_layers = self
.create_image_layers(
&partitioning,
lsn,
@@ -744,30 +722,10 @@ impl Timeline {
ImageLayerCreationMode::Try
},
&image_ctx,
self.last_image_layer_creation_status
.load()
.as_ref()
.clone(),
)
.await
.inspect_err(|err| {
if let CreateImageLayersError::GetVectoredError(
GetVectoredError::MissingKey(_),
) = err
{
critical!("missing key during compaction: {err:?}");
}
})?;
self.last_image_layer_creation_status
.store(Arc::new(outcome.clone()));
.await?;
self.upload_new_image_layers(image_layers)?;
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
// Yield and do not do any other kind of compaction.
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
return Ok(CompactionOutcome::Pending);
}
partitioning.parts.len()
}
Err(err) => {
@@ -795,7 +753,7 @@ impl Timeline {
self.compact_shard_ancestors(rewrite_max, ctx).await?;
}
Ok(CompactionOutcome::Done)
Ok(false)
}
/// Check for layers that are elegible to be rewritten:
@@ -1052,11 +1010,11 @@ impl Timeline {
target_file_size: u64,
force_compaction_ignore_threshold: bool,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
) -> Result<bool, CompactionError> {
let CompactLevel0Phase1Result {
new_layers,
deltas_to_compact,
outcome,
fully_compacted,
} = {
let phase1_span = info_span!("compact_level0_phase1");
let ctx = ctx.attached_child();
@@ -1085,12 +1043,12 @@ impl Timeline {
if new_layers.is_empty() && deltas_to_compact.is_empty() {
// nothing to do
return Ok(CompactionOutcome::Done);
return Ok(true);
}
self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
.await?;
Ok(outcome)
Ok(fully_compacted)
}
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
@@ -1632,11 +1590,7 @@ impl Timeline {
.into_iter()
.map(|x| x.drop_eviction_guard())
.collect::<Vec<_>>(),
outcome: if fully_compacted {
CompactionOutcome::Done
} else {
CompactionOutcome::Pending
},
fully_compacted,
})
}
}
@@ -1647,7 +1601,7 @@ struct CompactLevel0Phase1Result {
deltas_to_compact: Vec<Layer>,
// Whether we have included all L0 layers, or selected only part of them due to the
// L0 compaction size limit.
outcome: CompactionOutcome,
fully_compacted: bool,
}
#[derive(Default)]
@@ -3278,7 +3232,11 @@ impl TimelineAdaptor {
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
};
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
let outcome = self
let start = Key::MIN;
let ImageLayerCreationOutcome {
unfinished_image_layer,
next_start_key: _,
} = self
.timeline
.create_image_layer_for_rel_blocks(
&keyspace,
@@ -3286,15 +3244,13 @@ impl TimelineAdaptor {
lsn,
ctx,
key_range.clone(),
start,
IoConcurrency::sequential(),
)
.await?;
if let ImageLayerCreationOutcome::Generated {
unfinished_image_layer,
} = outcome
{
let (desc, path) = unfinished_image_layer.finish(ctx).await?;
if let Some(image_layer_writer) = unfinished_image_layer {
let (desc, path) = image_layer_writer.finish(ctx).await?;
let image_layer =
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
self.new_images.push(image_layer);

View File

@@ -341,13 +341,6 @@ impl DeleteTimelineFlow {
let tenant_shard_id = timeline.tenant_shard_id();
let timeline_id = timeline.timeline_id();
// Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest.
let Ok(tenant_guard) = tenant.gate.enter() else {
// It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion.
info!("Tenant is shutting down, timeline deletion will be resumed when it next starts");
return;
};
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
@@ -355,8 +348,6 @@ impl DeleteTimelineFlow {
Some(timeline_id),
"timeline_delete",
async move {
let _guard = tenant_guard;
if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
// Only log as an error if it's not a cancellation.
if matches!(err, DeleteTimelineError::Cancelled) {

View File

@@ -30,11 +30,8 @@ use crate::{
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
size::CalculateSyntheticSizeError,
storage_layer::LayerVisibilityHint,
tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit},
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
},
};
@@ -83,6 +80,8 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
use crate::tenant::tasks::random_init_delay;
// acquire the gate guard only once within a useful span
let Ok(guard) = self.gate.enter() else {
return;
@@ -95,7 +94,7 @@ impl Timeline {
EvictionPolicy::OnlyImitiate(lat) => lat.period,
EvictionPolicy::NoEviction => Duration::from_secs(10),
};
if sleep_random(period, &self.cancel).await.is_err() {
if random_init_delay(period, &self.cancel).await.is_err() {
return;
}
}
@@ -331,10 +330,9 @@ impl Timeline {
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit(
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
false,
ctx,
);
@@ -376,7 +374,7 @@ impl Timeline {
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
permit: BackgroundLoopSemaphorePermit<'static>,
permit: tokio::sync::SemaphorePermit<'static>,
ctx: &RequestContext,
) -> ControlFlow<()> {
if !self.tenant_shard_id.is_shard_zero() {

View File

@@ -39,7 +39,7 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
/// Status of the connection.
@@ -355,19 +355,6 @@ pub(super) async fn handle_walreceiver_connection(
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
async fn commit(
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
uncommitted: &mut u64,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST.records_committed.inc_by(*uncommitted);
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
Ok(())
}
if !records.is_empty() {
timeline
.metrics
@@ -379,7 +366,8 @@ pub(super) async fn handle_walreceiver_connection(
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
modification.commit(&ctx).await?;
uncommitted_records = 0;
}
let local_next_record_lsn = interpreted.next_record_lsn;
@@ -393,13 +381,6 @@ pub(super) async fn handle_walreceiver_connection(
.await
.with_context(|| {
format!("could not ingest record at {local_next_record_lsn}")
})
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() {
critical!("{err:?}")
}
})?;
uncommitted_records += 1;
@@ -415,7 +396,8 @@ pub(super) async fn handle_walreceiver_connection(
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
modification.commit(&ctx).await?;
uncommitted_records = 0;
}
}
@@ -433,7 +415,7 @@ pub(super) async fn handle_walreceiver_connection(
if uncommitted_records > 0 || needs_last_record_lsn_advance {
// Commit any uncommitted records
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
modification.commit(&ctx).await?;
}
if !caught_up && streaming_lsn >= end_of_wal {
@@ -460,12 +442,10 @@ pub(super) async fn handle_walreceiver_connection(
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
WAL_INGEST.inc_values_committed(&stats);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
@@ -527,13 +507,6 @@ pub(super) async fn handle_walreceiver_connection(
.await
.with_context(|| {
format!("could not ingest record at {next_record_lsn}")
})
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() {
critical!("{err:?}")
}
})?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");

View File

@@ -28,9 +28,17 @@ use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::walrecord::*;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use wal_decoder::models::*;
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use utils::failpoint_support;
use utils::rate_limit::RateLimit;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
@@ -42,18 +50,11 @@ use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::walrecord::*;
use postgres_ffi::TransactionId;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use utils::bin_ser::SerializeError;
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical, failpoint_support};
use wal_decoder::models::*;
enum_pgversion! {CheckPoint, pgv::CheckPoint}
@@ -326,75 +327,93 @@ impl WalIngest {
let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
// VM bits can only be cleared on the shard(s) owning the VM relation, and must be within
// its view of the VM relation size. Out of caution, error instead of failing WAL ingestion,
// as there has historically been cases where PostgreSQL has cleared spurious VM pages. See:
// https://github.com/neondatabase/neon/pull/10634.
// Sometimes, Postgres seems to create heap WAL records with the
// ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
// not set. In fact, it's possible that the VM page does not exist at all.
// In that case, we don't want to store a record to clear the VM bit;
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
//
// TODO: analyze the metrics and tighten this up accordingly. This logic
// implicitly assumes that VM pages see explicit WAL writes before
// implicit ClearVmBits, and will otherwise silently drop updates.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
critical!("clear_vm_bits for unknown VM relation {vm_rel}");
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["relation"])
.inc();
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["new_page"])
.inc();
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["old_page"])
.inc();
old_vm_blk = None;
}
}
if new_vm_blk.is_none() && old_vm_blk.is_none() {
return Ok(());
} else if new_vm_blk == old_vm_blk {
// An UPDATE record that needs to clear the bits for both old and the new page, both of
// which reside on the same VM page.
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk.unwrap(),
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
if new_vm_blk.is_some() || old_vm_blk.is_some() {
if new_vm_blk == old_vm_blk {
// An UPDATE record that needs to clear the bits for both old and the
// new page, both of which reside on the same VM page.
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk,
new_vm_blk.unwrap(),
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
},
ctx,
)
.await?;
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
old_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on
// different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
},
ctx,
)
.await?;
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
old_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
}
}
}
Ok(())
}
@@ -1157,7 +1176,6 @@ impl WalIngest {
// See also the neon code changes in the InitWalRecovery() function.
if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
&& info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
&& self.shard.is_shard_zero()
{
let oldest_active_xid = if pg_version >= 17 {
let mut oldest_active_full_xid = cp.nextXid.value;

View File

@@ -79,14 +79,6 @@ impl WalRedoProcess {
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to

View File

@@ -509,44 +509,47 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
hash = get_hash_value(lfc_hash, &tag);
chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
LWLockAcquire(lfc_lock, LW_SHARED);
if (!LFC_ENABLED())
{
LWLockRelease(lfc_lock);
return 0;
}
while (true)
{
int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
if (entry != NULL)
int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
if (LFC_ENABLED())
{
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
if (entry != NULL)
{
if ((entry->bitmap[chunk_offs >> 5] &
((uint32)1 << (chunk_offs & 31))) != 0)
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
{
BITMAP_SET(bitmap, i);
found++;
if ((entry->bitmap[chunk_offs >> 5] &
((uint32)1 << (chunk_offs & 31))) != 0)
{
BITMAP_SET(bitmap, i);
found++;
}
}
}
else
{
i += this_chunk;
}
}
else
{
i += this_chunk;
LWLockRelease(lfc_lock);
return found;
}
/*
* Break out of the iteration before doing expensive stuff for
* a next iteration
*/
if (i >= nblocks)
if (i + 1 >= nblocks)
break;
/*
@@ -560,8 +563,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
#ifdef USE_ASSERT_CHECKING
{
#if USE_ASSERT_CHECKING
do {
int count = 0;
for (int j = 0; j < nblocks; j++)
@@ -571,7 +574,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
Assert(count == found);
}
} while (false);
#endif
return found;

View File

@@ -36,11 +36,6 @@
#include "pagestore_client.h"
#include "walproposer.h"
#ifdef __linux__
#include <sys/ioctl.h>
#include <linux/sockios.h>
#endif
#define PageStoreTrace DEBUG5
#define MIN_RECONNECT_INTERVAL_USEC 1000
@@ -733,36 +728,11 @@ retry:
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS)
{
int sndbuf = -1;
int recvbuf = -1;
#ifdef __linux__
int socketfd;
#endif
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
#ifdef __linux__
/*
* get kernel's send and recv queue size via ioctl
* https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27
*/
socketfd = PQsocket(pageserver_conn);
if (socketfd != -1) {
int ioctl_err;
ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf);
if (ioctl_err!= 0) {
sndbuf = -errno;
}
ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf);
if (ioctl_err != 0) {
recvbuf = -errno;
}
}
#endif
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)",
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf);
shard->nrequests_sent, shard->nresponses_received);
last_log_ts = now;
logged = true;
}

View File

@@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
{
uint64 min_ring_index;
PrefetchRequest hashkey;
#ifdef USE_ASSERT_CHECKING
#if USE_ASSERT_CHECKING
bool any_hits = false;
#endif
/* We will never read further ahead than our buffer can store. */
@@ -955,7 +955,7 @@ Retry:
else
lsns = NULL;
#ifdef USE_ASSERT_CHECKING
#if USE_ASSERT_CHECKING
any_hits = true;
#endif
@@ -3011,7 +3011,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
start_ts = GetCurrentTimestamp();
if (RecoveryInProgress() && MyBackendType != B_STARTUP)
XLogWaitForReplayOf(reqlsns->request_lsn);
XLogWaitForReplayOf(reqlsns[0].request_lsn);
/*
* Try to find prefetched page in the list of received pages.

View File

@@ -19,7 +19,6 @@ aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
base64.workspace = true
boxcar = "0.2.8"
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
@@ -43,7 +42,6 @@ hyper0.workspace = true
hyper = { workspace = true, features = ["server", "http1", "http2"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
gettid = "0.1.3"
indexmap = { workspace = true, features = ["serde"] }
ipnet.workspace = true
itertools.workspace = true
@@ -52,8 +50,6 @@ lasso = { workspace = true, features = ["multi-threaded"] }
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry = { workspace = true, features = ["trace"] }
papaya = "0.1.8"
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
@@ -93,9 +89,6 @@ tokio = { workspace = true, features = ["signal"] }
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
tracing-log.workspace = true
tracing-serde.workspace = true
tracing-opentelemetry.workspace = true
try-lock.workspace = true
typed-json.workspace = true
url.workspace = true
@@ -119,7 +112,6 @@ rsa = "0.9"
workspace_hack.workspace = true
[dev-dependencies]
assert-json-diff.workspace = true
camino-tempfile.workspace = true
fallible-iterator.workspace = true
flate2.workspace = true

View File

@@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation
If both postgres and proxy are running you may send a SQL query:
```console
curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \
-H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \
curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
-H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
-H 'Content-Type: application/json' \
--data '{
"query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
@@ -104,9 +104,19 @@ cases where it is hard to use rows represented as objects (e.g. when several fie
## Test proxy locally
Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.local.neon.build` which resolves to `127.0.0.1`.
Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`.
We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows:
Let's create self-signed certificate by running:
```sh
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
```
Then we need to build proxy with 'testing' feature and run, e.g.:
```sh
RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key
```
We will also need to have a postgres instance. Assuming that we have setted up docker we can set it up as follows:
```sh
docker run \
--detach \
@@ -123,18 +133,8 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl
docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';"
```
Let's create self-signed certificate by running:
```sh
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build"
```
Then we need to build proxy with 'testing' feature and run, e.g.:
```sh
RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
```
Now from client you can start a new session:
```sh
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full"
```
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full"
```

View File

@@ -69,35 +69,17 @@ pub async fn handle_cancel_messages(
value,
resp_tx,
_guard,
expire,
expire: _,
} => {
let res = client.hset(&key, field, value).await;
if let Some(resp_tx) = resp_tx {
if res.is_ok() {
resp_tx
.send(client.expire(key, expire).await)
.inspect_err(|e| {
tracing::debug!(
"failed to send StoreCancelKey response: {:?}",
e
);
})
.ok();
} else {
resp_tx
.send(res)
.inspect_err(|e| {
tracing::debug!(
"failed to send StoreCancelKey response: {:?}",
e
);
})
.ok();
}
} else if res.is_ok() {
drop(client.expire(key, expire).await);
resp_tx
.send(client.hset(key, field, value).await)
.inspect_err(|e| {
tracing::debug!("failed to send StoreCancelKey response: {:?}", e);
})
.ok();
} else {
tracing::warn!("failed to store cancel key: {:?}", res);
drop(client.hset(key, field, value).await);
}
}
CancelKeyOp::GetCancelData {
@@ -454,7 +436,7 @@ impl Session {
&self.key
}
// Send the store key op to the cancellation handler and set TTL for the key
// Send the store key op to the cancellation handler
pub(crate) async fn write_cancel_key(
&self,
cancel_closure: CancelClosure,

View File

@@ -1,23 +1,10 @@
use std::cell::{Cell, RefCell};
use std::collections::HashMap;
use std::hash::BuildHasher;
use std::{env, io};
use chrono::{DateTime, Utc};
use opentelemetry::trace::TraceContextExt;
use scopeguard::defer;
use serde::ser::{SerializeMap, Serializer};
use tracing::span;
use tracing::subscriber::Interest;
use tracing::{callsite, Event, Metadata, Span, Subscriber};
use tracing_opentelemetry::OpenTelemetrySpanExt;
use tracing::Subscriber;
use tracing_subscriber::filter::{EnvFilter, LevelFilter};
use tracing_subscriber::fmt::format::{Format, Full};
use tracing_subscriber::fmt::time::SystemTime;
use tracing_subscriber::fmt::{FormatEvent, FormatFields};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;
use tracing_subscriber::registry::{LookupSpan, SpanRef};
use tracing_subscriber::registry::LookupSpan;
/// Initialize logging and OpenTelemetry tracing and exporter.
///
@@ -28,8 +15,6 @@ use tracing_subscriber::registry::{LookupSpan, SpanRef};
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
pub async fn init() -> anyhow::Result<LoggingGuard> {
let logfmt = LogFormat::from_env()?;
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.from_env_lossy()
@@ -44,36 +29,17 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
.expect("this should be a valid filter directive"),
);
let fmt_layer = tracing_subscriber::fmt::layer()
.with_ansi(false)
.with_writer(std::io::stderr)
.with_target(false);
let otlp_layer = tracing_utils::init_tracing("proxy").await;
let json_log_layer = if logfmt == LogFormat::Json {
Some(JsonLoggingLayer {
clock: RealClock,
skipped_field_indices: papaya::HashMap::default(),
writer: StderrWriter {
stderr: std::io::stderr(),
},
})
} else {
None
};
let text_log_layer = if logfmt == LogFormat::Text {
Some(
tracing_subscriber::fmt::layer()
.with_ansi(false)
.with_writer(std::io::stderr)
.with_target(false),
)
} else {
None
};
tracing_subscriber::registry()
.with(env_filter)
.with(otlp_layer)
.with(json_log_layer)
.with(text_log_layer)
.with(fmt_layer)
.try_init()?;
Ok(LoggingGuard)
@@ -128,857 +94,3 @@ impl Drop for LoggingGuard {
tracing_utils::shutdown_tracing();
}
}
// TODO: make JSON the default
#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
enum LogFormat {
#[default]
Text = 1,
Json,
}
impl LogFormat {
fn from_env() -> anyhow::Result<Self> {
let logfmt = env::var("LOGFMT");
Ok(match logfmt.as_deref() {
Err(_) => LogFormat::default(),
Ok("text") => LogFormat::Text,
Ok("json") => LogFormat::Json,
Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"),
})
}
}
trait MakeWriter {
fn make_writer(&self) -> impl io::Write;
}
struct StderrWriter {
stderr: io::Stderr,
}
impl MakeWriter for StderrWriter {
#[inline]
fn make_writer(&self) -> impl io::Write {
self.stderr.lock()
}
}
// TODO: move into separate module or even separate crate.
trait Clock {
fn now(&self) -> DateTime<Utc>;
}
struct RealClock;
impl Clock for RealClock {
#[inline]
fn now(&self) -> DateTime<Utc> {
Utc::now()
}
}
/// Name of the field used by tracing crate to store the event message.
const MESSAGE_FIELD: &str = "message";
thread_local! {
/// Protects against deadlocks and double panics during log writing.
/// The current panic handler will use tracing to log panic information.
static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
/// Thread-local instance with per-thread buffer for log writing.
static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
/// Cached OS thread ID.
static THREAD_ID: u64 = gettid::gettid();
}
/// Implements tracing layer to handle events specific to logging.
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
clock: C,
skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
writer: W,
}
impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) {
use std::io::Write;
// TODO: consider special tracing subscriber to grab timestamp very
// early, before OTel machinery, and add as event extension.
let now = self.clock.now();
let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
if entered.get() {
let mut formatter = EventFormatter::new();
formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
self.writer.make_writer().write_all(formatter.buffer())
} else {
entered.set(true);
defer!(entered.set(false););
EVENT_FORMATTER.with_borrow_mut(move |formatter| {
formatter.reset();
formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
self.writer.make_writer().write_all(formatter.buffer())
})
}
});
// In case logging fails we generate a simpler JSON object.
if let Err(err) = res {
if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
"timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
"level": "ERROR",
"message": format_args!("cannot log event: {err:?}"),
"fields": {
"event": format_args!("{event:?}"),
},
})) {
line.push(b'\n');
self.writer.make_writer().write_all(&line).ok();
}
}
}
/// Registers a SpanFields instance as span extension.
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
let fields = SpanFields::default();
fields.record_fields(attrs);
// This could deadlock when there's a panic somewhere in the tracing
// event handling and a read or write guard is still held. This includes
// the OTel subscriber.
span.extensions_mut().insert(fields);
}
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
let ext = span.extensions();
if let Some(data) = ext.get::<SpanFields>() {
data.record_fields(values);
}
}
/// Called (lazily) whenever a new log call is executed. We quickly check
/// for duplicate field names and record duplicates as skippable. Last one
/// wins.
fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
if !metadata.is_event() {
// Must not be never because we wouldn't get trace and span data.
return Interest::always();
}
let mut field_indices = SkippedFieldIndices::default();
let mut seen_fields = HashMap::<&'static str, usize>::new();
for field in metadata.fields() {
use std::collections::hash_map::Entry;
match seen_fields.entry(field.name()) {
Entry::Vacant(entry) => {
// field not seen yet
entry.insert(field.index());
}
Entry::Occupied(mut entry) => {
// replace currently stored index
let old_index = entry.insert(field.index());
// ... and append it to list of skippable indices
field_indices.push(old_index);
}
}
}
if !field_indices.is_empty() {
self.skipped_field_indices
.pin()
.insert(metadata.callsite(), field_indices);
}
Interest::always()
}
}
/// Stores span field values recorded during the spans lifetime.
#[derive(Default)]
struct SpanFields {
// TODO: Switch to custom enum with lasso::Spur for Strings?
fields: papaya::HashMap<&'static str, serde_json::Value>,
}
impl SpanFields {
#[inline]
fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
fields.record(&mut SpanFieldsRecorder {
fields: self.fields.pin(),
});
}
}
/// Implements a tracing field visitor to convert and store values.
struct SpanFieldsRecorder<'m, S, G> {
fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
}
impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if let Ok(value) = i64::try_from(value) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if let Ok(value) = u64::try_from(value) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value:?}")));
}
#[inline]
fn record_error(
&mut self,
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
/// List of field indices skipped during logging. Can list duplicate fields or
/// metafields not meant to be logged.
#[derive(Clone, Default)]
struct SkippedFieldIndices {
bits: u64,
}
impl SkippedFieldIndices {
#[inline]
fn is_empty(&self) -> bool {
self.bits == 0
}
#[inline]
fn push(&mut self, index: usize) {
self.bits |= 1u64
.checked_shl(index as u32)
.expect("field index too large");
}
#[inline]
fn contains(&self, index: usize) -> bool {
self.bits
& 1u64
.checked_shl(index as u32)
.expect("field index too large")
!= 0
}
}
/// Formats a tracing event and writes JSON to its internal buffer including a newline.
// TODO: buffer capacity management, truncate if too large
struct EventFormatter {
logline_buffer: Vec<u8>,
}
impl EventFormatter {
#[inline]
fn new() -> Self {
EventFormatter {
logline_buffer: Vec::new(),
}
}
#[inline]
fn buffer(&self) -> &[u8] {
&self.logline_buffer
}
#[inline]
fn reset(&mut self) {
self.logline_buffer.clear();
}
fn format<S>(
&mut self,
now: DateTime<Utc>,
event: &Event<'_>,
ctx: &Context<'_, S>,
skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
) -> io::Result<()>
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true);
use tracing_log::NormalizeEvent;
let normalized_meta = event.normalized_metadata();
let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());
let skipped_field_indices = skipped_field_indices.pin();
let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
let mut serialize = || {
let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
let mut serializer = serializer.serialize_map(None)?;
// Timestamp comes first, so raw lines can be sorted by timestamp.
serializer.serialize_entry("timestamp", &timestamp)?;
// Level next.
serializer.serialize_entry("level", &meta.level().as_str())?;
// Message next.
serializer.serialize_key("message")?;
let mut message_extractor =
MessageFieldExtractor::new(serializer, skipped_field_indices);
event.record(&mut message_extractor);
let mut serializer = message_extractor.into_serializer()?;
let mut fields_present = FieldsPresent(false, skipped_field_indices);
event.record(&mut fields_present);
if fields_present.0 {
serializer.serialize_entry(
"fields",
&SerializableEventFields(event, skipped_field_indices),
)?;
}
let pid = std::process::id();
if pid != 1 {
serializer.serialize_entry("process_id", &pid)?;
}
THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?;
// TODO: tls cache? name could change
if let Some(thread_name) = std::thread::current().name() {
if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" {
serializer.serialize_entry("thread_name", thread_name)?;
}
}
if let Some(task_id) = tokio::task::try_id() {
serializer.serialize_entry("task_id", &format_args!("{task_id}"))?;
}
serializer.serialize_entry("target", meta.target())?;
if let Some(module) = meta.module_path() {
if module != meta.target() {
serializer.serialize_entry("module", module)?;
}
}
if let Some(file) = meta.file() {
if let Some(line) = meta.line() {
serializer.serialize_entry("src", &format_args!("{file}:{line}"))?;
} else {
serializer.serialize_entry("src", file)?;
}
}
{
let otel_context = Span::current().context();
let otel_spanref = otel_context.span();
let span_context = otel_spanref.span_context();
if span_context.is_valid() {
serializer.serialize_entry(
"trace_id",
&format_args!("{}", span_context.trace_id()),
)?;
}
}
serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?;
serializer.end()
};
serialize().map_err(io::Error::other)?;
self.logline_buffer.push(b'\n');
Ok(())
}
}
/// Extracts the message field that's mixed will other fields.
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Option<Result<(), S::Error>>,
}
impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
state: None,
}
}
#[inline]
fn into_serializer(mut self) -> Result<S, S::Error> {
match self.state {
Some(Ok(())) => {}
Some(Err(err)) => return Err(err),
None => self.serializer.serialize_value("")?,
}
Ok(self.serializer)
}
#[inline]
fn accept_field(&self, field: &tracing::field::Field) -> bool {
self.state.is_none()
&& field.name() == MESSAGE_FIELD
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}")));
}
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}")));
}
}
#[inline]
fn record_error(
&mut self,
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&format_args!("{value}")));
}
}
}
/// Checks if there's any fields and field values present. If not, the JSON subobject
/// can be skipped.
// This is entirely optional and only cosmetic, though maybe helps a
// bit during log parsing in dashboards when there's no field with empty object.
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
// Even though some methods have an overhead (error, bytes) it is assumed the
// compiler won't include this since we ignore the value entirely.
impl tracing::field::Visit for FieldsPresent<'_> {
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
if !self.1.is_some_and(|i| i.contains(field.index()))
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
{
self.0 |= true;
}
}
}
/// Serializes the fields directly supplied with a log event.
struct SerializableEventFields<'a, 'event>(
&'a tracing::Event<'event>,
Option<&'a SkippedFieldIndices>,
);
impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
use serde::ser::SerializeMap;
let serializer = serializer.serialize_map(None)?;
let mut message_skipper = MessageFieldSkipper::new(serializer, self.1);
self.0.record(&mut message_skipper);
let serializer = message_skipper.into_serializer()?;
serializer.end()
}
}
/// A tracing field visitor that skips the message field.
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Result<(), S::Error>,
}
impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
state: Ok(()),
}
}
#[inline]
fn accept_field(&self, field: &tracing::field::Field) -> bool {
self.state.is_ok()
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
#[inline]
fn into_serializer(self) -> Result<S, S::Error> {
self.state?;
Ok(self.serializer)
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
if self.accept_field(field) {
self.state = self
.serializer
.serialize_entry(field.name(), &format_args!("{value:x?}"));
}
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
if self.accept_field(field) {
self.state = self
.serializer
.serialize_entry(field.name(), &format_args!("{value:?}"));
}
}
#[inline]
fn record_error(
&mut self,
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
if self.accept_field(field) {
self.state = self.serializer.serialize_value(&format_args!("{value}"));
}
}
}
/// Serializes the span stack from root to leaf (parent of event) enumerated
/// inside an object where the keys are just the number padded with zeroes
/// to retain sorting order.
// The object is necessary because Loki cannot flatten arrays.
struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
where
Span: Subscriber + for<'lookup> LookupSpan<'lookup>;
impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
where
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
where
Ser: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
if let Some(leaf_span) = self.0.lookup_current() {
for (i, span) in leaf_span.scope().from_root().enumerate() {
serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
}
}
serializer.end()
}
}
/// Serializes a single span. Include the span ID, name and its fields as
/// recorded up to this point.
struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
where
Span: for<'lookup> LookupSpan<'lookup>;
impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
where
Span: for<'lookup> LookupSpan<'lookup>,
{
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
where
Ser: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
// TODO: the span ID is probably only useful for debugging tracing.
serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?;
serializer.serialize_entry("span_name", self.0.metadata().name())?;
let ext = self.0.extensions();
if let Some(data) = ext.get::<SpanFields>() {
for (key, value) in &data.fields.pin() {
serializer.serialize_entry(key, value)?;
}
}
serializer.end()
}
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use std::sync::{Arc, Mutex, MutexGuard};
use assert_json_diff::assert_json_eq;
use tracing::info_span;
use super::*;
struct TestClock {
current_time: Mutex<DateTime<Utc>>,
}
impl Clock for Arc<TestClock> {
fn now(&self) -> DateTime<Utc> {
*self.current_time.lock().expect("poisoned")
}
}
struct VecWriter<'a> {
buffer: MutexGuard<'a, Vec<u8>>,
}
impl MakeWriter for Arc<Mutex<Vec<u8>>> {
fn make_writer(&self) -> impl io::Write {
VecWriter {
buffer: self.lock().expect("poisoned"),
}
}
}
impl io::Write for VecWriter<'_> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.buffer.write(buf)
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
#[test]
fn test_field_collection() {
let clock = Arc::new(TestClock {
current_time: Mutex::new(Utc::now()),
});
let buffer = Arc::new(Mutex::new(Vec::new()));
let log_layer = JsonLoggingLayer {
clock: clock.clone(),
skipped_field_indices: papaya::HashMap::default(),
writer: buffer.clone(),
};
let registry = tracing_subscriber::Registry::default().with(log_layer);
tracing::subscriber::with_default(registry, || {
info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| {
info_span!("span2").in_scope(|| {
tracing::error!(
a = 1,
a = 2,
a = 3,
message = "explicit message field",
"implicit message field"
);
});
});
});
let buffer = Arc::try_unwrap(buffer)
.expect("no other reference")
.into_inner()
.expect("poisoned");
let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON");
let expected: serde_json::Value = serde_json::json!(
{
"timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
"level": "ERROR",
"message": "explicit message field",
"fields": {
"a": 3,
},
"spans": {
"00":{
"span_id": "0000000000000001",
"span_name": "span1",
"x": 42,
},
"01": {
"span_id": "0000000000000002",
"span_name": "span2",
}
},
"src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(),
"target": "proxy::logging::tests",
"process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(),
"thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(),
"thread_name": "logging::tests::test_field_collection",
}
);
assert_json_eq!(actual, expected);
}
}

View File

@@ -15,8 +15,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio::sync::mpsc::error::SendError;
use tokio::task::JoinHandle;
use tokio::time::MissedTickBehavior;
use tracing::{error, info, info_span, Instrument};
use utils::critical;
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use utils::postgres_client::Compression;
use utils::postgres_client::InterpretedFormat;
@@ -121,20 +120,6 @@ pub enum InterpretedWalReaderError {
WalStreamClosed,
}
enum CurrentPositionUpdate {
Reset(Lsn),
NotReset(Lsn),
}
impl CurrentPositionUpdate {
fn current_position(&self) -> Lsn {
match self {
CurrentPositionUpdate::Reset(lsn) => *lsn,
CurrentPositionUpdate::NotReset(lsn) => *lsn,
}
}
}
impl InterpretedWalReaderState {
fn current_position(&self) -> Option<Lsn> {
match self {
@@ -144,26 +129,6 @@ impl InterpretedWalReaderState {
InterpretedWalReaderState::Done => None,
}
}
// Reset the current position of the WAL reader if the requested starting position
// of the new shard is smaller than the current value.
fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate {
match self {
InterpretedWalReaderState::Running {
current_position, ..
} => {
if new_shard_start_pos < *current_position {
*current_position = new_shard_start_pos;
CurrentPositionUpdate::Reset(*current_position)
} else {
CurrentPositionUpdate::NotReset(*current_position)
}
}
InterpretedWalReaderState::Done => {
panic!("maybe_reset called on finished reader")
}
}
}
}
pub(crate) struct AttachShardNotification {
@@ -214,10 +179,11 @@ impl InterpretedWalReader {
metric.dec();
}
reader
.run_impl(start_pos)
.await
.inspect_err(|err| critical!("failed to read WAL record: {err:?}"))
let res = reader.run_impl(start_pos).await;
if let Err(ref err) = res {
tracing::error!("Task finished with error: {err}");
}
res
}
.instrument(info_span!("interpreted wal reader")),
);
@@ -273,10 +239,11 @@ impl InterpretedWalReader {
metric.dec();
}
if let Err(err) = self.run_impl(start_pos).await {
critical!("failed to read WAL record: {err:?}");
let res = self.run_impl(start_pos).await;
if let Err(err) = res {
tracing::error!("Interpreted wal reader encountered error: {err}");
} else {
info!("interpreted wal reader exiting");
tracing::info!("Interpreted wal reader exiting");
}
Err(CopyStreamHandlerEnd::Other(anyhow!(
@@ -443,24 +410,15 @@ impl InterpretedWalReader {
};
senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
// If the shard is subscribing below the current position the we need
// to update the cursor that tracks where we are at in the WAL
// ([`Self::state`]) and reset the WAL stream itself
// (`[Self::wal_stream`]). This must be done atomically from the POV of
// anything outside the select statement.
let position_reset = self.state.write().unwrap().maybe_reset(start_pos);
match position_reset {
CurrentPositionUpdate::Reset(to) => {
self.wal_stream.reset(to).await;
wal_decoder = WalStreamDecoder::new(to, self.pg_version);
},
CurrentPositionUpdate::NotReset(_) => {}
};
let current_pos = self.state.read().unwrap().current_position().unwrap();
if start_pos < current_pos {
self.wal_stream.reset(start_pos).await;
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
}
tracing::info!(
"Added shard sender {} with start_pos={} current_pos={}",
ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position()
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
);
}
}
@@ -626,7 +584,7 @@ mod tests {
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();
@@ -757,6 +715,7 @@ mod tests {
const MSG_COUNT: usize = 200;
const PG_VERSION: u32 = 17;
const SHARD_COUNT: u8 = 2;
const ATTACHED_SHARDS: u8 = 4;
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
let env = Env::new(true).unwrap();
@@ -766,11 +725,9 @@ mod tests {
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let mut next_record_lsns = Vec::default();
let end_watch =
Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
.await
.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();
let streaming_wal_reader = StreamingWalReader::new(
@@ -789,71 +746,38 @@ mod tests {
)
.unwrap();
struct Sender {
tx: Option<tokio::sync::mpsc::Sender<Batch>>,
rx: tokio::sync::mpsc::Receiver<Batch>,
shard: ShardIdentity,
start_lsn: Lsn,
received_next_record_lsns: Vec<Lsn>,
}
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
let mut batch_receivers = vec![rx];
impl Sender {
fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self {
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
Self {
tx: Some(tx),
rx,
shard,
start_lsn,
received_next_record_lsns: Vec::default(),
}
}
}
assert!(next_record_lsns.len() > 7);
let start_lsns = vec![
next_record_lsns[5],
next_record_lsns[1],
next_record_lsns[3],
];
let mut senders = start_lsns
.into_iter()
.map(|lsn| Sender::new(lsn, shard_0))
.collect::<Vec<_>>();
let first_sender = senders.first_mut().unwrap();
let handle = InterpretedWalReader::spawn(
streaming_wal_reader,
first_sender.start_lsn,
first_sender.tx.take().unwrap(),
first_sender.shard,
start_lsn,
tx,
shard_0,
PG_VERSION,
&Some("pageserver".to_string()),
);
for sender in senders.iter_mut().skip(1) {
handle
.fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn)
.unwrap();
for _ in 0..(ATTACHED_SHARDS - 1) {
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
handle.fanout(shard_0, tx, start_lsn).unwrap();
batch_receivers.push(rx);
}
for sender in senders.iter_mut() {
loop {
let batch = sender.rx.recv().await.unwrap();
tracing::info!(
"Sender with start_lsn={} received batch ending at {} with {} records",
sender.start_lsn,
batch.wal_end_lsn,
batch.records.records.len()
loop {
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
for rx in batch_receivers.iter_mut().skip(1) {
let other_batch = rx.recv().await.unwrap();
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
assert_eq!(
batch.available_wal_end_lsn,
other_batch.available_wal_end_lsn
);
}
for rec in batch.records.records {
sender.received_next_record_lsns.push(rec.next_record_lsn);
}
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
}
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
}
}
@@ -868,20 +792,5 @@ mod tests {
}
assert!(done);
for sender in senders {
tracing::info!(
"Validating records received by sender with start_lsn={}",
sender.start_lsn
);
assert!(sender.received_next_record_lsns.is_sorted());
let expected = next_record_lsns
.iter()
.filter(|lsn| **lsn > sender.start_lsn)
.copied()
.collect::<Vec<_>>();
assert_eq!(sender.received_next_record_lsns, expected);
}
}
}

View File

@@ -122,7 +122,6 @@ impl Env {
start_lsn: Lsn,
msg_size: usize,
msg_count: usize,
mut next_record_lsns: Option<&mut Vec<Lsn>>,
) -> anyhow::Result<EndWatch> {
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
@@ -131,7 +130,7 @@ impl Env {
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
let prefix = c"neon-file:";
let prefix = c"p";
let prefixlen = prefix.to_bytes_with_nul().len();
assert!(msg_size >= prefixlen);
let message = vec![0; msg_size - prefixlen];
@@ -140,9 +139,6 @@ impl Env {
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
for _ in 0..msg_count {
let (lsn, record) = walgen.next().unwrap();
if let Some(ref mut lsns) = next_record_lsns {
lsns.push(lsn);
}
let req = AppendRequest {
h: AppendRequestHeader {

View File

@@ -592,8 +592,6 @@ impl Timeline {
assert!(self.cancel.is_cancelled());
assert!(self.gate.close_complete());
info!("deleting timeline {} from disk", self.ttid);
// Close associated FDs. Nobody will be able to touch timeline data once
// it is cancelled, so WAL storage won't be opened again.
shared_state.sk.close_wal_store();

View File

@@ -475,8 +475,6 @@ impl GlobalTimelines {
info!("deleting timeline {}, only_local={}", ttid, only_local);
timeline.shutdown().await;
info!("timeline {ttid} shut down for deletion");
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;

View File

@@ -535,10 +535,6 @@ pub async fn main_task(
// limit concurrent uploads
let _upload_permit = tokio::select! {
acq = limiter.acquire_partial_backup() => acq,
_ = backup.tli.cancel.cancelled() => {
info!("timeline canceled");
return None;
}
_ = cancel.cancelled() => {
info!("task canceled");
return None;

View File

@@ -246,7 +246,7 @@ mod tests {
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();

View File

@@ -32,7 +32,6 @@ CREATE TABLE IF NOT EXISTS results (
flaky BOOLEAN NOT NULL,
arch arch DEFAULT 'X64',
lfc BOOLEAN DEFAULT false NOT NULL,
sanitizers BOOLEAN DEFAULT false NOT NULL,
build_type TEXT NOT NULL,
pg_version INT NOT NULL,
run_id BIGINT NOT NULL,
@@ -40,7 +39,7 @@ CREATE TABLE IF NOT EXISTS results (
reference TEXT NOT NULL,
revision CHAR(40) NOT NULL,
raw JSONB COMPRESSION lz4 NOT NULL,
UNIQUE (parent_suite, suite, name, arch, lfc, sanitizers, build_type, pg_version, started_at, stopped_at, run_id)
UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
);
"""
@@ -57,7 +56,6 @@ class Row:
flaky: bool
arch: str
lfc: bool
sanitizers: bool
build_type: str
pg_version: int
run_id: int
@@ -137,7 +135,6 @@ def ingest_test_result(
}
arch = parameters.get("arch", "UNKNOWN").strip("'")
lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc"
sanitizers = parameters.get("sanitizers", "disabled").strip("'") == "enabled"
build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
labels = {label["name"]: label["value"] for label in test["labels"]}
@@ -152,7 +149,6 @@ def ingest_test_result(
flaky=test["flaky"] or test["retriesStatusChange"],
arch=arch,
lfc=lfc,
sanitizers=sanitizers,
build_type=build_type,
pg_version=pg_version,
run_id=run_id,

View File

@@ -32,7 +32,6 @@ postgres_connection.workspace = true
rand.workspace = true
reqwest = { workspace = true, features = ["stream"] }
routerify.workspace = true
rustls-native-certs.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
@@ -40,20 +39,18 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true
rustls.workspace = true
scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
diesel = { version = "2.2.6", features = [
"serde_json",
"postgres",
"r2d2",
"chrono",
] }
diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] }
diesel_migrations = { version = "2.2.0" }
scoped-futures = "0.1.4"
r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }

View File

@@ -225,7 +225,7 @@ pub(crate) enum NotifyError {
// We shutdown while sending
#[error("Shutting down")]
ShuttingDown,
// A response indicates we will never succeed, such as 400 or 403
// A response indicates we will never succeed, such as 400 or 404
#[error("Non-retryable error {0}")]
Fatal(StatusCode),

View File

@@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> {
// Validate that we can connect to the database
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
let persistence = Arc::new(Persistence::new(secrets.database_url).await);
let persistence = Arc::new(Persistence::new(secrets.database_url));
let service = Service::spawn(config, persistence.clone()).await?;

File diff suppressed because it is too large Load Diff

View File

@@ -115,15 +115,6 @@ impl ReconcilerConfigBuilder {
}
}
pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self {
Self {
config: ReconcilerConfig {
tenant_creation_hint: hint,
..self.config
},
}
}
pub(crate) fn build(self) -> ReconcilerConfig {
self.config
}
@@ -138,10 +129,6 @@ pub(crate) struct ReconcilerConfig {
// During live migrations this is the amount of time that
// the pagserver will hold our poll.
secondary_download_request_timeout: Option<Duration>,
// A hint indicating whether this reconciliation is done on the
// creation of a new tenant. This only informs logging behaviour.
tenant_creation_hint: bool,
}
impl ReconcilerConfig {
@@ -156,10 +143,6 @@ impl ReconcilerConfig {
self.secondary_download_request_timeout
.unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
}
pub(crate) fn tenant_creation_hint(&self) -> bool {
self.tenant_creation_hint
}
}
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
@@ -951,35 +934,16 @@ impl Reconciler {
)
.await;
if let Err(e) = &result {
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
// needs to retry at some point.
self.compute_notify_failure = true;
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
match e {
// 404s from cplane during tenant creation are expected.
// Cplane only persists the shards to the database after
// creating the tenant and the timeline. If we notify before
// that, we'll get a 404.
//
// This is fine because tenant creations happen via /location_config
// and that returns the list of locations in the response. Hence, we
// silence the error and return Ok(()) here. Reconciliation will still
// be retried because we set [`Reconciler::compute_notify_failure`] above.
NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND)
if self.reconciler_config.tenant_creation_hint() =>
{
return Ok(());
}
NotifyError::ShuttingDown => {}
_ => {
tracing::warn!(
"Failed to notify compute of attached pageserver {node}: {e}"
);
}
if !matches!(e, NotifyError::ShuttingDown) {
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
}
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
// needs to retry at some point.
self.compute_notify_failure = true;
}
result
} else {

View File

@@ -774,9 +774,8 @@ impl Scheduler {
if !matches!(context.mode, ScheduleMode::Speculative) {
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})",
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>(),
preferred_az,
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
);
}

View File

@@ -2238,14 +2238,9 @@ impl Service {
let waiters = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
let config = ReconcilerConfigBuilder::new()
.tenant_creation_hint(true)
.build();
tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
.filter_map(|(_shard_id, shard)| {
self.maybe_configured_reconcile_shard(shard, nodes, config)
})
.filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
.collect::<Vec<_>>()
};

View File

@@ -707,15 +707,10 @@ impl TenantShard {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(scheduler, *node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
//
// We use [`AttachedShardTag`] because when a secondary location is the only one
// a shard has, we expect that its next use will be as an attached location: we want
// the tenant to be ready to warm up and run fast in their preferred AZ.
let node_id = scheduler.schedule_shard::<AttachedShardTag>(
let node_id = scheduler.schedule_shard::<SecondaryShardTag>(
&[],
&self.intent.preferred_az_id,
context,
@@ -724,17 +719,9 @@ impl TenantShard {
modified = true;
}
while self.intent.secondary.len() > 1 {
// If we have multiple secondaries (e.g. when transitioning from Attached to Secondary and
// having just demoted our attached location), then we should prefer to keep the location
// in our preferred AZ. Tenants in Secondary mode want to be in the preferred AZ so that
// they have a warm location to become attached when transitioning back into Attached.
let mut candidates = self.intent.get_secondary().clone();
// Sort to get secondaries outside preferred AZ last
candidates
.sort_by_key(|n| scheduler.get_node_az(n).as_ref() != self.preferred_az());
let secondary_to_remove = candidates.pop().unwrap();
self.intent.remove_secondary(scheduler, secondary_to_remove);
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
@@ -980,51 +967,24 @@ impl TenantShard {
),
)
})
.collect::<HashMap<_, _>>();
.collect::<Vec<_>>();
if secondary_scores.iter().any(|score| score.1.is_none()) {
// Trivial case: if we only have one secondary, drop that one
if self.intent.get_secondary().len() == 1 {
return Some(ScheduleOptimization {
sequence: self.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(
*self.intent.get_secondary().first().unwrap(),
),
});
}
// Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state
// where its score can't be calculated), and drop the others. This enables us to make progress in
// most cases, even if some nodes are offline or have scheduling=pause set.
debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this
// logic presumes we are in a mode where we want secondaries to be in non-home AZ
if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| {
let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id;
let is_available = secondary_scores
.get(n)
.expect("Built from same list of nodes")
.is_some();
is_available && !in_home_az
}) {
// Great, we found one to retain. Pick some other to drop.
if let Some(victim) = self
.intent
.get_secondary()
.iter()
.find(|n| n != &retain_secondary)
{
// Don't have full list of scores, so can't make a good decision about which to drop unless
// there is an obvious one in the wrong AZ
for secondary in self.intent.get_secondary() {
if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
return Some(ScheduleOptimization {
sequence: self.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(*victim),
action: ScheduleOptimizationAction::RemoveSecondary(*secondary),
});
}
}
// Fall through: we didn't identify one to remove. This ought to be rare.
tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)",
self.intent.get_secondary()
);
self.intent.get_secondary()
);
} else {
let victim = secondary_scores
.iter()
@@ -1033,7 +993,7 @@ impl TenantShard {
.0;
return Some(ScheduleOptimization {
sequence: self.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(*victim),
action: ScheduleOptimizationAction::RemoveSecondary(victim),
});
}
}
@@ -1119,31 +1079,12 @@ impl TenantShard {
None => vec![],
};
let replacement = match &self.policy {
PlacementPolicy::Attached(_) => {
// Secondaries for an attached shard should be scheduled using `SecondaryShardTag`
// to avoid placing them in the preferred AZ.
self.find_better_location::<SecondaryShardTag>(
scheduler,
&schedule_context,
*secondary,
&exclude,
)
}
PlacementPolicy::Secondary => {
// In secondary-only mode, we want our secondary locations in the preferred AZ,
// so that they're ready to take over as an attached location when we transition
// into PlacementPolicy::Attached.
self.find_better_location::<AttachedShardTag>(
scheduler,
&schedule_context,
*secondary,
&exclude,
)
}
PlacementPolicy::Detached => None,
};
let replacement = self.find_better_location::<SecondaryShardTag>(
scheduler,
&schedule_context,
*secondary,
&exclude,
);
assert!(replacement != Some(*secondary));
if let Some(replacement) = replacement {
// We have found a candidate and confirmed that its score is preferable
@@ -2407,110 +2348,6 @@ pub(crate) mod tests {
Ok(())
}
/// Test how the optimisation code behaves with an extra secondary
#[test]
fn optimize_removes_secondary() -> anyhow::Result<()> {
let az_a_tag = AvailabilityZone("az-a".to_string());
let az_b_tag = AvailabilityZone("az-b".to_string());
let mut nodes = make_test_nodes(
4,
&[
az_a_tag.clone(),
az_b_tag.clone(),
az_a_tag.clone(),
az_b_tag.clone(),
],
);
let mut scheduler = Scheduler::new(nodes.values());
let mut schedule_context = ScheduleContext::default();
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
shard_a.intent.preferred_az_id = Some(az_a_tag.clone());
shard_a
.schedule(&mut scheduler, &mut schedule_context)
.unwrap();
// Attached on node 1, secondary on node 2
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]);
// Initially optimiser is idle
assert_eq!(
shard_a.optimize_attachment(&mut scheduler, &schedule_context),
None
);
assert_eq!(
shard_a.optimize_secondary(&mut scheduler, &schedule_context),
None
);
// A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over
// to our new location
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3))
})
);
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
// A spare secondary in the non-home AZ, and one of them is offline
shard_a.intent.push_secondary(&mut scheduler, NodeId(4));
nodes
.get_mut(&NodeId(4))
.unwrap()
.set_availability(NodeAvailability::Offline);
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4))
})
);
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
// A spare secondary when should have none
shard_a.policy = PlacementPolicy::Attached(0);
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2))
})
);
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![]);
// Check that in secondary mode, we preserve the secondary in the preferred AZ
let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule()
shard_a.policy = PlacementPolicy::Secondary;
shard_a
.schedule(&mut scheduler, &mut schedule_context)
.unwrap();
assert_eq!(shard_a.intent.get_attached(), &None);
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
assert_eq!(
shard_a.optimize_attachment(&mut scheduler, &schedule_context),
None
);
assert_eq!(
shard_a.optimize_secondary(&mut scheduler, &schedule_context),
None
);
shard_a.intent.clear(&mut scheduler);
Ok(())
}
// Optimize til quiescent: this emulates what Service::optimize_all does, when
// called repeatedly in the background.
// Returns the applied optimizations
@@ -2850,108 +2687,4 @@ pub(crate) mod tests {
}
Ok(())
}
/// Check how the shard's scheduling behaves when in PlacementPolicy::Secondary mode.
#[test]
fn tenant_secondary_scheduling() -> anyhow::Result<()> {
let az_a = AvailabilityZone("az-a".to_string());
let nodes = make_test_nodes(
3,
&[
az_a.clone(),
AvailabilityZone("az-b".to_string()),
AvailabilityZone("az-c".to_string()),
],
);
let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Secondary);
tenant_shard.intent.preferred_az_id = Some(az_a.clone());
tenant_shard
.schedule(&mut scheduler, &mut context)
.expect("we have enough nodes, scheduling should work");
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_none());
// Should have scheduled into the preferred AZ
assert_eq!(
scheduler
.get_node_az(&tenant_shard.intent.secondary[0])
.as_ref(),
tenant_shard.preferred_az()
);
// Optimizer should agree
assert_eq!(
tenant_shard.optimize_attachment(&mut scheduler, &context),
None
);
assert_eq!(
tenant_shard.optimize_secondary(&mut scheduler, &context),
None
);
// Switch to PlacementPolicy::Attached
tenant_shard.policy = PlacementPolicy::Attached(1);
tenant_shard
.schedule(&mut scheduler, &mut context)
.expect("we have enough nodes, scheduling should work");
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_some());
// Secondary should now be in non-preferred AZ
assert_ne!(
scheduler
.get_node_az(&tenant_shard.intent.secondary[0])
.as_ref(),
tenant_shard.preferred_az()
);
// Attached should be in preferred AZ
assert_eq!(
scheduler
.get_node_az(&tenant_shard.intent.attached.unwrap())
.as_ref(),
tenant_shard.preferred_az()
);
// Optimizer should agree
assert_eq!(
tenant_shard.optimize_attachment(&mut scheduler, &context),
None
);
assert_eq!(
tenant_shard.optimize_secondary(&mut scheduler, &context),
None
);
// Switch back to PlacementPolicy::Secondary
tenant_shard.policy = PlacementPolicy::Secondary;
tenant_shard
.schedule(&mut scheduler, &mut context)
.expect("we have enough nodes, scheduling should work");
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_none());
// When we picked a location to keep, we should have kept the one in the preferred AZ
assert_eq!(
scheduler
.get_node_az(&tenant_shard.intent.secondary[0])
.as_ref(),
tenant_shard.preferred_az()
);
// Optimizer should agree
assert_eq!(
tenant_shard.optimize_attachment(&mut scheduler, &context),
None
);
assert_eq!(
tenant_shard.optimize_secondary(&mut scheduler, &context),
None
);
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
}

View File

@@ -2766,11 +2766,6 @@ class NeonPageserver(PgProtocol, LogUtils):
log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}")
raise
def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any:
path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json"
with open(path) as f:
return json.load(f)
def tenant_create(
self,
tenant_id: TenantId,
@@ -3345,7 +3340,7 @@ class NeonProxy(PgProtocol):
metric_collection_interval: str | None = None,
):
host = "127.0.0.1"
domain = "proxy.local.neon.build" # resolves to 127.0.0.1
domain = "proxy.localtest.me" # resolves to 127.0.0.1
super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port)
self.domain = domain
@@ -3368,7 +3363,7 @@ class NeonProxy(PgProtocol):
# generate key of it doesn't exist
crt_path = self.test_output_dir / "proxy.crt"
key_path = self.test_output_dir / "proxy.key"
generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path)
generate_proxy_tls_certs("*.localtest.me", key_path, crt_path)
args = [
str(self.neon_binpath / "proxy"),
@@ -3569,7 +3564,7 @@ class NeonAuthBroker:
external_http_port: int,
auth_backend: NeonAuthBroker.ProxyV1,
):
self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1
self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1
self.host = "127.0.0.1"
self.http_port = http_port
self.external_http_port = external_http_port
@@ -3586,7 +3581,7 @@ class NeonAuthBroker:
# generate key of it doesn't exist
crt_path = self.test_output_dir / "proxy.crt"
key_path = self.test_output_dir / "proxy.key"
generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path)
generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path)
args = [
str(self.neon_binpath / "proxy"),
@@ -5001,35 +4996,13 @@ def check_restored_datadir_content(
assert (mismatch, error) == ([], [])
# wait for subscriber to catch up with publisher
def logical_replication_sync(
subscriber: PgProtocol,
publisher: PgProtocol,
# pass subname explicitly to avoid confusion
# when multiple subscriptions are present
subname: str,
sub_dbname: str | None = None,
pub_dbname: str | None = None,
):
) -> Lsn:
"""Wait logical replication subscriber to sync with publisher."""
def initial_sync():
# first check if the subscription is active `s`=`synchronized`, `r` = `ready`
query = f"""SELECT 1 FROM pg_subscription_rel join pg_catalog.pg_subscription
on pg_subscription_rel.srsubid = pg_subscription.oid
WHERE srsubstate NOT IN ('r', 's') and subname='{subname}'"""
if sub_dbname is not None:
res = subscriber.safe_psql(query, dbname=sub_dbname)
else:
res = subscriber.safe_psql(query)
assert (res is None) or (len(res) == 0)
wait_until(initial_sync)
# wait for the subscription to catch up with current state of publisher
# caller is responsible to call checkpoint before calling this function
if pub_dbname is not None:
publisher_lsn = Lsn(
publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0]
@@ -5037,23 +5010,23 @@ def logical_replication_sync(
else:
publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
def subscriber_catch_up():
query = f"select latest_end_lsn from pg_catalog.pg_stat_subscription where latest_end_lsn is NOT NULL and subname='{subname}'"
while True:
if sub_dbname is not None:
res = subscriber.safe_psql(query, dbname=sub_dbname)
res = subscriber.safe_psql(
"select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname
)[0][0]
else:
res = subscriber.safe_psql(query)
res = subscriber.safe_psql(
"select latest_end_lsn from pg_catalog.pg_stat_subscription"
)[0][0]
assert res is not None
res_lsn = res[0][0]
log.info(f"subscriber_lsn={res_lsn}")
subscriber_lsn = Lsn(res_lsn)
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
assert subscriber_lsn >= publisher_lsn
wait_until(subscriber_catch_up)
if res:
log.info(f"subscriber_lsn={res}")
subscriber_lsn = Lsn(res)
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
if subscriber_lsn >= publisher_lsn:
return subscriber_lsn
time.sleep(0.5)
def tenant_get_shards(
@@ -5122,14 +5095,12 @@ def wait_for_last_flush_lsn(
timeline: TimelineId,
pageserver_id: int | None = None,
auth_token: str | None = None,
last_flush_lsn: Lsn | None = None,
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
shards = tenant_get_shards(env, tenant, pageserver_id)
if last_flush_lsn is None:
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
results = []
for tenant_shard_id, pageserver in shards:

View File

@@ -124,8 +124,5 @@ def pytest_runtest_makereport(*args, **kwargs):
allure.dynamic.parameter(
"__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc"
)
allure.dynamic.parameter(
"__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled"
)
yield

View File

@@ -282,35 +282,18 @@ class S3Storage:
def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str:
"""
Gets the latest generation key from a list of keys.
@param index_keys: A list of keys of different generations, which start with `prefix`
"""
def parse_gen(key: str) -> int:
shortname = key.split("/")[-1]
generation_str = shortname.removeprefix(prefix).removesuffix(suffix)
try:
return int(generation_str, base=16)
except ValueError:
log.info(f"Ignoring non-matching key: {key}")
return -1
if len(keys) == 0:
raise IndexError("No keys found")
return max(keys, key=parse_gen)
def get_latest_index_key(self, index_keys: list[str]) -> str:
"""
Gets the latest index file key.
@param index_keys: A list of index keys of different generations.
"""
key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys)
return key
def parse_gen(index_key: str) -> int:
parts = index_key.split("index_part.json-")
return int(parts[-1], base=16) if len(parts) == 2 else -1
return max(index_keys, key=parse_gen)
def download_index_part(self, index_key: str) -> IndexPartDump:
"""
@@ -323,29 +306,6 @@ class S3Storage:
log.info(f"index_part.json: {body}")
return IndexPartDump.from_json(json.loads(body))
def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None:
tenant_prefix = self.tenant_path(tenant_id)
objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[
"Contents"
]
keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1]
try:
manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys)
except IndexError:
log.info(
f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet"
)
return None
response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key)
body = response["Body"].read().decode("utf-8")
log.info(f"Downloaded manifest {manifest_key}: {body}")
manifest = json.loads(body)
assert isinstance(manifest, dict)
return manifest
def heatmap_key(self, tenant_id: TenantId) -> str:
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"

View File

@@ -1,18 +1,13 @@
# Logical replication tests
> [!NOTE]
> Neon project should have logical replication enabled:
>
> https://neon.tech/docs/guides/logical-replication-postgres#enable-logical-replication-in-the-source-neon-project
## Clickhouse
```bash
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml up -d
./scripts/pytest -m remote_cluster -k test_clickhouse
docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down
docker compose -f clickhouse/docker-compose.yml up -d
pytest -m remote_cluster -k test_clickhouse
docker compose -f clickhouse/docker-compose.yml down
```
## Debezium
@@ -20,7 +15,8 @@ docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down
```bash
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
docker compose -f test_runner/logical_repl/debezium/docker-compose.yml up -d
./scripts/pytest -m remote_cluster -k test_debezium
docker compose -f test_runner/logical_repl/debezium/docker-compose.yml down
```
docker compose -f debezium/docker-compose.yml up -d
pytest -m remote_cluster -k test_debezium
docker compose -f debezium/docker-compose.yml down
```

View File

@@ -76,9 +76,6 @@ def test_ingest_logical_message(
log.info("Waiting for Pageserver to catch up")
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
recover_to_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
endpoint.stop()
# Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
# reingest all the WAL from the safekeeper without any other constraints. This gives us a
# baseline of how fast the pageserver can ingest this WAL in isolation.
@@ -91,13 +88,7 @@ def test_ingest_logical_message(
with zenbenchmark.record_duration("pageserver_recover_ingest"):
log.info("Recovering WAL into pageserver")
client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
wait_for_last_flush_lsn(
env, endpoint, env.initial_tenant, env.initial_timeline, last_flush_lsn=recover_to_lsn
)
# Check endpoint can start, i.e. we really recovered
endpoint.start()
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
# Emit metrics.
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))

View File

@@ -34,20 +34,16 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
cur.execute("set log_statement = 'all'")
cur.execute("create table t(x integer)")
for _ in range(n_iters):
with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"):
cur.execute(f"insert into t values (generate_series(1,{n_records}))")
cur.execute(f"insert into t values (generate_series(1,{n_records}))")
time.sleep(1)
with zenbenchmark.record_duration("vacuum t"):
cur.execute("vacuum t")
cur.execute("vacuum t")
with zenbenchmark.record_duration("SELECT count(*) from t"):
with zenbenchmark.record_duration("test_query"):
cur.execute("SELECT count(*) from t")
assert cur.fetchone() == (n_iters * n_records,)
with zenbenchmark.record_duration("flush_ep_to_pageserver"):
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
with zenbenchmark.record_duration("timeline_checkpoint"):
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)

View File

@@ -44,13 +44,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
# Wait logical replication channel to be established
logical_replication_sync(vanilla_pg, endpoint, "sub1")
logical_replication_sync(vanilla_pg, endpoint)
pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()])
# Wait logical replication to sync
start = time.time()
logical_replication_sync(vanilla_pg, endpoint, "sub1")
logical_replication_sync(vanilla_pg, endpoint)
log.info(f"Sync with master took {time.time() - start} seconds")
sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0])

View File

@@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path):
"LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
"PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
"PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
"PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
"PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
}
# Combine the current environment with custom variables
env = os.environ.copy()

View File

@@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "openssl"
version = "0.10.70"
version = "0.10.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6"
checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
dependencies = [
"bitflags 2.6.0",
"cfg-if",
@@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.105"
version = "0.9.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc"
checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
dependencies = [
"cc",
"libc",

View File

@@ -184,7 +184,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"gc_compaction_enabled": True,
"gc_compaction_initial_threshold_kb": 1024000,
"gc_compaction_ratio_percent": 200,
"image_creation_preempt_threshold": 5,
}
vps_http = env.storage_controller.pageserver_api()

View File

@@ -29,21 +29,6 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
# "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}
PREEMPT_COMPACTION_TENANT_CONF = {
"gc_period": "5s",
"compaction_period": "5s",
# Small checkpoint distance to create many layers
"checkpoint_distance": 1024**2,
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 1,
"image_creation_preempt_threshold": 1,
# compact more frequently
"compaction_threshold": 3,
"compaction_upper_limit": 6,
"lsn_lease_length": "0s",
}
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
@@ -51,8 +36,7 @@ PREEMPT_COMPACTION_TENANT_CONF = {
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -70,8 +54,7 @@ def test_pageserver_compaction_smoke(
page_cache_size=10
"""
conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -130,41 +113,6 @@ page_cache_size=10
assert vectored_average < 8
@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_preempt(
neon_env_builder: NeonEnvBuilder,
):
# Ideally we should be able to do unit tests for this, but we need real Postgres
# WALs in order to do unit testing...
conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
row_count = 200000
churn_rounds = 10
ps_http = env.pageserver.http_client()
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageserver.id)
log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)
for i in range(1, churn_rounds + 1):
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id, upload=False)
workload.validate(env.pageserver.id)
ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
# ensure image layer creation gets preempted and then resumed
env.pageserver.assert_log_contains("resuming image layer creation")
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"with_branches",
@@ -302,9 +250,6 @@ def test_pageserver_gc_compaction_idempotent(
workload.churn_rows(row_count, env.pageserver.id)
# compact 3 times if mode is before_restart
n_compactions = 3 if compaction_mode == "before_restart" else 1
ps_http.timeline_compact(
tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True
)
for _ in range(n_compactions):
# Force refresh gc info to have gc_cutoff generated
ps_http.timeline_gc(tenant_id, timeline_id, None)

View File

@@ -314,10 +314,7 @@ def test_forward_compatibility(
def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
ep = env.endpoints.create("main")
ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")}
ep.start(env=ep_env)
ep = env.endpoints.create_start("main")
connstr = ep.connstr()
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
@@ -366,7 +363,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
)
# Timeline exists again: restart the endpoint
ep.start(env=ep_env)
ep.start()
pg_bin.run_capture(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
@@ -474,14 +471,6 @@ HISTORIC_DATA_SETS = [
PgVersion.V16,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst",
),
# This dataset created on a pageserver running modern code at time of capture, but configured with no generation. This
# is our regression test that we can load data written without generations in layer file names & indices
HistoricDataSet(
"2025-02-07-nogenerations",
TenantId("e1411ca6562d6ff62419f693a5695d67"),
PgVersion.V17,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst",
),
]

View File

@@ -183,7 +183,6 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');")
cursor.execute("CREATE TABLE t(a int)")
cursor.execute("INSERT INTO t VALUES (1)")
cursor.execute("CHECKPOINT")
# connect to the subscriber_db and create a subscription
# Note that we need to create subscription with
@@ -196,11 +195,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
# wait for the subscription to be active
logical_replication_sync(
endpoint,
endpoint,
"mysub",
sub_dbname="subscriber_db",
pub_dbname="publisher_db",
endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db"
)
# Check that replication is working

View File

@@ -95,8 +95,6 @@ def test_remote_extensions(
# mock remote_extensions spec
spec: dict[str, Any] = {
"public_extensions": ["anon"],
"custom_extensions": None,
"library_index": {
"anon": "anon",
},

Some files were not shown because too many files have changed in this diff Show More