Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-17 01:20:36 +00:00.

Compare commits: sk-proto-v...tristan957 (67 commits)
| SHA1 |
|---|
| e7ec82cc08 |
| 3d143ad799 |
| b0c7ee0175 |
| 8c4e94107d |
| c368b0fe14 |
| aba61a3712 |
| 946da3f7e2 |
| 73633e27ed |
| 0cf0119751 |
| b37f52fdf1 |
| 443c8d0b4b |
| 2f36bdb218 |
| e7118213ab |
| d204d51faf |
| ac55e2dbe5 |
| 874accd6ed |
| 6cd3b501ec |
| bf20d78292 |
| 2656c713a4 |
| 5e95860e70 |
| 0abff59e97 |
| 9609f7547e |
| d6e87a3a9c |
| f5243992fa |
| 95220ba43e |
| 08f92bb916 |
| 8f651f9582 |
| b5a239c4ae |
| de05258419 |
| e73d681a0e |
| 44b905d14b |
| 186199f406 |
| 82cbab7512 |
| 2943590694 |
| df06c41085 |
| ddd7c36343 |
| 839f41f5bb |
| f22d41eaec |
| 977781e423 |
| 67b71538d0 |
| f4cfa725b8 |
| 05326cc247 |
| b66fbd6176 |
| 95588dab98 |
| 1686d9e733 |
| abcd00181c |
| 01f0be03b5 |
| 81cd30e4d6 |
| 7fc6953da4 |
| 77f9e74d86 |
| 0ceeec9be3 |
| 733a57247b |
| 6699a30a49 |
| 133b89a83d |
| fba22a7123 |
| 14e05276a3 |
| ebc55e6ae8 |
| f07119cca7 |
| 47975d06d9 |
| 472007dd7c |
| f9009d6b80 |
| cab60b6d9f |
| 06090bbccd |
| dcf335a251 |
| b6e9daea9a |
| d5c3a4e2b9 |
| 8107140f7f |
@@ -24,3 +24,4 @@
!storage_controller/
!vendor/postgres-*/
!workspace_hack/
!build_tools/patches

@@ -121,6 +121,8 @@ runs:
export DEFAULT_PG_VERSION=${PG_VERSION#v}
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1
export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1

if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1
37  .github/workflows/_build-and-test-locally.yml  (vendored)

@@ -23,6 +23,11 @@ on:
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string

defaults:
run:

@@ -87,6 +92,7 @@ jobs:
- name: Set env variables
env:
ARCH: ${{ inputs.arch }}
SANITIZERS: ${{ inputs.sanitizers }}
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then

@@ -99,8 +105,14 @@ jobs:
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
if [[ $SANITIZERS == 'enabled' ]]; then
make_vars="WITH_SANITIZERS=yes"
else
make_vars=""
fi
{
echo "cov_prefix=${cov_prefix}"
echo "make_vars=${make_vars}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"

@@ -136,35 +148,39 @@ jobs:
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
run: mold -run make ${make_vars} postgres-v14 -j$(nproc)

- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
run: mold -run make ${make_vars} postgres-v15 -j$(nproc)

- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
run: mold -run make ${make_vars} postgres-v16 -j$(nproc)

- name: Build postgres v17
if: steps.cache_pg_17.outputs.cache-hit != 'true'
run: mold -run make postgres-v17 -j$(nproc)
run: mold -run make ${make_vars} postgres-v17 -j$(nproc)

- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
run: mold -run make ${make_vars} neon-pg-ext -j$(nproc)

- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
run: mold -run make ${make_vars} walproposer-lib -j$(nproc)

- name: Run cargo build
env:
WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }}
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
export ASAN_OPTIONS=detect_leaks=0
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}

# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
env:
ARCH: ${{ inputs.arch }}
SANITIZERS: ${{ inputs.sanitizers }}
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/

@@ -179,7 +195,7 @@ jobs:
done

# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/

@@ -212,6 +228,7 @@ jobs:
role-duration-seconds: 18000 # 5 hours

- name: Run rust tests
if: ${{ inputs.sanitizers != 'enabled' }}
env:
NEXTEST_RETRIES: 3
run: |

@@ -273,6 +290,7 @@ jobs:
DATABASE_URL: postgresql://localhost:1235/storage_controller
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
run: |
export ASAN_OPTIONS=detect_leaks=0
/tmp/neon/bin/neon_local init
/tmp/neon/bin/neon_local storage_controller start

@@ -319,7 +337,7 @@ jobs:
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }}
with:
build_type: ${{ inputs.build-type }}
test_selection: regress

@@ -337,6 +355,7 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}

# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
14  .github/workflows/approved-for-ci-run.yml  (vendored)

@@ -67,9 +67,9 @@ jobs:
- uses: actions/checkout@v4
with:
ref: main
ref: ${{ github.event.pull_request.head.sha }}
token: ${{ secrets.CI_ACCESS_TOKEN }}

- name: Look for existing PR
id: get-pr
env:

@@ -77,7 +77,7 @@ jobs:
run: |
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}

- name: Get changed labels
id: get-labels
if: steps.get-pr.outputs.ALREADY_CREATED != ''

@@ -94,10 +94,6 @@ jobs:
echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}

- uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.sha }}

- run: git checkout -b "${BRANCH}"

- run: git push --force origin "${BRANCH}"

@@ -105,7 +101,7 @@ jobs:
- name: Create a Pull Request for CI run (if required)
if: steps.get-pr.outputs.ALREADY_CREATED == ''
env:
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md

@@ -142,7 +138,7 @@ jobs:
- run: git push --force origin "${BRANCH}"
if: steps.get-pr.outputs.ALREADY_CREATED != ''

cleanup:
# Close PRs and delete branchs if the original PR is closed.
2  .github/workflows/build_and_test.yml  (vendored)

@@ -682,7 +682,7 @@ jobs:
push: true
pull: true
file: compute/compute-node.Dockerfile
target: neon-pg-ext-test
target: extension-tests
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
134  .github/workflows/build_and_test_with_sanitizers.yml  (vendored, new file)

@@ -0,0 +1,134 @@
name: Build and Test with Sanitizers

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 1 * * *' # run once a day, timezone is utc
workflow_dispatch:

defaults:
run:
shell: bash -euxo pipefail {0}

concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true

env:
RUST_BACKTRACE: 1
COPT: '-Werror'

jobs:
tag:
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}

steps:
# Need `fetch-depth: 0` to count the number of commits in the branch
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Get build tag
run: |
echo run:$GITHUB_RUN_ID
echo ref:$GITHUB_REF_NAME
echo rev:$(git rev-list --count HEAD)
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
fi
shell: bash
id: build-tag

build-build-tools-image:
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit

build-and-test-locally:
needs: [ tag, build-build-tools-image ]
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
build-type: [ release ]
uses: ./.github/workflows/_build-and-test-locally.yml
with:
arch: ${{ matrix.arch }}
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
test-cfg: '[{"pg_version":"v17"}]'
sanitizers: enabled
secrets: inherit

create-test-report:
needs: [ build-and-test-locally, build-build-tools-image ]
if: ${{ !cancelled() }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
pull-requests: write
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}

runs-on: [ self-hosted, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init

steps:
- uses: actions/checkout@v4

- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

- uses: actions/github-script@v7
if: ${{ !cancelled() }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const report = {
reportUrl: "${{ steps.create-allure-report.outputs.report-url }}",
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
}

const coverage = {}

const script = require("./scripts/comment-test-report.js")
await script({
github,
context,
fetch,
report,
coverage,
})
130  Cargo.lock  (generated)
@@ -206,6 +206,16 @@ dependencies = [
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "assert-json-diff"
|
||||
version = "2.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-channel"
|
||||
version = "1.9.0"
|
||||
@@ -290,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "aws-config"
|
||||
version = "1.5.15"
|
||||
version = "1.5.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90"
|
||||
checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -301,7 +311,7 @@ dependencies = [
|
||||
"aws-sdk-sts",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.60.7",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
@@ -332,9 +342,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-runtime"
|
||||
version = "1.5.4"
|
||||
version = "1.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac"
|
||||
checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-sigv4",
|
||||
@@ -366,7 +376,7 @@ dependencies = [
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.61.1",
|
||||
"aws-smithy-query",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -389,7 +399,7 @@ dependencies = [
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.61.1",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
@@ -414,7 +424,7 @@ dependencies = [
|
||||
"aws-smithy-checksums",
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.61.1",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
@@ -437,15 +447,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sso"
|
||||
version = "1.57.0"
|
||||
version = "1.50.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38"
|
||||
checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.61.1",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
@@ -459,15 +469,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-ssooidc"
|
||||
version = "1.58.0"
|
||||
version = "1.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c"
|
||||
checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.61.1",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
@@ -481,15 +491,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sts"
|
||||
version = "1.58.0"
|
||||
version = "1.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962"
|
||||
checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-json 0.61.1",
|
||||
"aws-smithy-query",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -504,9 +514,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sigv4"
|
||||
version = "1.2.7"
|
||||
version = "1.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05"
|
||||
checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-eventstream",
|
||||
@@ -533,9 +543,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-async"
|
||||
version = "1.2.4"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e"
|
||||
checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"pin-project-lite",
|
||||
@@ -565,9 +575,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-eventstream"
|
||||
version = "0.60.6"
|
||||
version = "0.60.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a"
|
||||
checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
@@ -576,9 +586,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-http"
|
||||
version = "0.60.12"
|
||||
version = "0.60.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc"
|
||||
checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6"
|
||||
dependencies = [
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -597,9 +607,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-json"
|
||||
version = "0.61.2"
|
||||
version = "0.60.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422"
|
||||
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-json"
|
||||
version = "0.61.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
]
|
||||
@@ -616,9 +635,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime"
|
||||
version = "1.7.7"
|
||||
version = "1.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e"
|
||||
checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
@@ -660,9 +679,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-types"
|
||||
version = "1.2.12"
|
||||
version = "1.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97"
|
||||
checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510"
|
||||
dependencies = [
|
||||
"base64-simd",
|
||||
"bytes",
|
||||
@@ -695,9 +714,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-types"
|
||||
version = "1.3.4"
|
||||
version = "1.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2"
|
||||
checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-async",
|
||||
@@ -1010,6 +1029,12 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxcar"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "1.5.0"
|
||||
@@ -2433,6 +2458,16 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gettid"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gimli"
|
||||
version = "0.31.1"
|
||||
@@ -4212,6 +4247,16 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"seize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking"
|
||||
version = "2.1.1"
|
||||
@@ -4839,6 +4884,7 @@ dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"assert-json-diff",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"atomic-take",
|
||||
@@ -4846,6 +4892,7 @@ dependencies = [
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"base64 0.13.1",
|
||||
"boxcar",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -4862,6 +4909,7 @@ dependencies = [
|
||||
"flate2",
|
||||
"framed-websockets",
|
||||
"futures",
|
||||
"gettid",
|
||||
"hashbrown 0.14.5",
|
||||
"hashlink",
|
||||
"hex",
|
||||
@@ -4884,7 +4932,9 @@ dependencies = [
|
||||
"measured",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"p256 0.13.2",
|
||||
"papaya",
|
||||
"parking_lot 0.12.1",
|
||||
"parquet",
|
||||
"parquet_derive",
|
||||
@@ -4931,6 +4981,9 @@ dependencies = [
|
||||
"tokio-tungstenite 0.21.0",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-log",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-serde",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"try-lock",
|
||||
@@ -5884,6 +5937,16 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "seize"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.17"
|
||||
@@ -8145,6 +8208,7 @@ dependencies = [
|
||||
"tower 0.4.13",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"url",
|
||||
"zerocopy",
|
||||
"zeroize",
|
||||
|
||||
@@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }

@@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"
tracing-log = "0.2"
tracing-opentelemetry = "0.28"
tracing-serde = "0.2.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }
17  Makefile

@@ -10,18 +10,29 @@ ICU_PREFIX_DIR := /usr/local/icu
# environment variable.
#
BUILD_TYPE ?= debug
WITH_SANITIZERS ?= no
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif

ifeq ($(WITH_SANITIZERS),yes)
PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover
COPT += -Wno-error # to avoid failing on warnings induced by sanitizers
PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS)
export CC := gcc
export ASAN_OPTIONS := detect_leaks=0
endif

ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
# Exclude static build openssl, icu for local build (MacOS, Linux)
# Only keep for build type release and debug

@@ -33,7 +44,9 @@ endif
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
ifneq ($(WITH_SANITIZERS),yes)
PG_CONFIGURE_OPTS += --with-libseccomp
endif
else ifeq ($(UNAME_S),Darwin)
PG_CFLAGS += -DUSE_PREFETCH
ifndef DISABLE_HOMEBREW

@@ -106,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
CFLAGS='$(PG_CFLAGS)' \
CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
@@ -3,10 +3,17 @@ ARG DEBIAN_VERSION=bookworm
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION

# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch

RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
set -e && \
apt update && \

@@ -39,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
mkdir /tmp/pgcopydb && \
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
cd /tmp/pgcopydb && \
patch -p1 < /pgcopydbv017.patch && \
make -s clean && \
make -s -j12 install && \
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \

@@ -55,7 +63,8 @@ ARG DEBIAN_VERSION
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

RUN mkdir -p /pgcopydb/bin && \
mkdir -p /pgcopydb/lib && \

@@ -66,7 +75,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

# System deps

@@ -190,8 +199,14 @@ RUN set -e \
# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
# And patches from us:
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
RUN set +o pipefail && \
for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
done && \
set -o pipefail
# Split into separate step to debug flaky failures here
RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
&& ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
&& cd lcov \
57  build_tools/patches/pgcopydbv017.patch  (new file)

@@ -0,0 +1,57 @@
diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c
index d730b03..69a9be9 100644
--- a/src/bin/pgcopydb/copydb.c
+++ b/src/bin/pgcopydb/copydb.c
@@ -44,6 +44,7 @@ GUC dstSettings[] = {
 { "synchronous_commit", "'off'" },
 { "statement_timeout", "0" },
 { "lock_timeout", "0" },
+ { "idle_in_transaction_session_timeout", "0" },
 { NULL, NULL },
 };

diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c
index 94f2f46..e051ba8 100644
--- a/src/bin/pgcopydb/pgsql.c
+++ b/src/bin/pgcopydb/pgsql.c
@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql,

 LinesBuffer lbuf = { 0 };

+ if (message != NULL){
+ // make sure message is writable by splitLines
+ message = strdup(message);
+ }
+
 if (!splitLines(&lbuf, message))
 {
 /* errors have already been logged */
@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql,
 PQbackendPID(pgsql->connection),
 lbuf.lines[lineNumber]);
 }
+ free(message); // free copy of message we created above

 if (pgsql->logSQL)
 {
@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context)
 /* errors have already been logged */
 return;
 }
-
 if (res != NULL)
 {
 char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+ if (sqlstate == NULL)
+ {
+ // PQresultErrorField returned NULL!
+ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault
+ }
+ else
+ {
+ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+ }
 }

 char *endpoint =
(File diff suppressed because it is too large.)

@@ -47,7 +47,9 @@ files:
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -41,6 +41,7 @@ use std::process::exit;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::time::SystemTime;
use std::{thread, time::Duration};

use anyhow::{Context, Result};

@@ -85,6 +86,19 @@ fn parse_remote_ext_config(arg: &str) -> Result<String> {
}
}

/// Generate a compute ID if one is not supplied. This exists to keep forward
/// compatibility tests working, but will be removed in a future iteration.
fn generate_compute_id() -> String {
let now = SystemTime::now();

format!(
"compute-{}",
now.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs()
)
}

#[derive(Parser)]
#[command(rename_all = "kebab-case")]
struct Cli {

@@ -130,10 +144,10 @@ struct Cli {
#[arg(short = 'S', long, group = "spec-path")]
pub spec_path: Option<OsString>,

#[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])]
pub compute_id: Option<String>,
#[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())]
pub compute_id: String,

#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")]
#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
pub control_plane_uri: Option<String>,
}

@@ -259,20 +273,11 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
});
}

if cli.compute_id.is_none() {
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
};
if cli.control_plane_uri.is_none() {
panic!("must specify both --control-plane-uri and --compute-id or none");
panic!("must specify --control-plane-uri");
};

match get_spec_from_control_plane(
cli.control_plane_uri.as_ref().unwrap(),
cli.compute_id.as_ref().unwrap(),
) {
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(spec) => Ok(CliSpecParams {
spec,
live_config_allowed: true,

@@ -319,6 +324,7 @@ fn wait_spec(
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
.context("cannot build tokio postgres config from connstr")?;
let compute_node = ComputeNode {
compute_id: cli.compute_id.clone(),
connstr,
conn_conf,
tokio_conn_conf,

@@ -231,6 +231,14 @@ pub(crate) async fn main() -> anyhow::Result<()> {
])
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()

@@ -59,6 +59,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0);

/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
/// The ID of the compute
pub compute_id: String,
// Url type maintains proper escaping
pub connstr: url::Url,
// We connect to Postgres from many different places, so build configs once
@@ -1,6 +1,7 @@
use anyhow::{anyhow, bail, Result};
use postgres::Client;
use reqwest::StatusCode;
use serde::Deserialize;
use std::fs::File;
use std::path::Path;
use tracing::{error, info, instrument, warn};

@@ -11,9 +12,26 @@ use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;

use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
use compute_api::spec::ComputeSpec;

/// Response of the `/computes/{compute_id}/spec` control-plane API.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
}

#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ControlPlaneComputeStatus {
// Compute is known to control-plane, but it's not
// yet attached to any timeline / endpoint.
Empty,
// Compute is attached to some timeline / endpoint and
// should be able to start with provided spec.
Attached,
}

// Do control plane request and return response if any. In case of error it
// returns a bool flag indicating whether it makes sense to retry the request
// and a string with error message.
@@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);

// Pass through these environment variables to the command
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
for var in [
"LLVM_PROFILE_FILE",
"FAILPOINTS",
"RUST_LOG",
"ASAN_OPTIONS",
"UBSAN_OPTIONS",
] {
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}

@@ -665,6 +665,22 @@ impl Endpoint {
.to_str()
.unwrap(),
])
// TODO: It would be nice if we generated compute IDs with the same
// algorithm as the real control plane.
//
// TODO: Add this back when
// https://github.com/neondatabase/neon/pull/10747 is merged.
//
//.args([
//     "--compute-id",
//     &format!(
//         "compute-{}",
//         SystemTime::now()
//             .duration_since(UNIX_EPOCH)
//             .unwrap()
//             .as_secs()
//     ),
//])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
@@ -221,7 +221,17 @@ impl StorageController {
"-p",
&format!("{}", postgres_port),
];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
let envs = [
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
];
let exitcode = Command::new(bin_path)
.args(args)
.envs(envs)
.spawn()?
.wait()
.await?;

Ok(exitcode.success())
}

@@ -242,6 +252,11 @@ impl StorageController {

let pg_bin_dir = self.get_pg_bin_dir().await?;
let createdb_path = pg_bin_dir.join("createdb");
let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
let envs = [
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
];
let output = Command::new(&createdb_path)
.args([
"-h",

@@ -254,6 +269,7 @@ impl StorageController {
&username(),
DB_NAME,
])
.envs(envs)
.output()
.await
.expect("Failed to spawn createdb");

@@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
allow = [
"0BSD",
"Apache-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
@@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do

if [ $pg_version -ge 16 ]; then
docker cp ext-src $TEST_CONTAINER_NAME:/
docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config

4  docker-compose/ext-src/pgjwt-src/neon-test.sh  (new executable file)

@@ -0,0 +1,4 @@
#!/bin/bash
set -ex
cd "$(dirname "${0}")"
pg_prove test.sql

15  docker-compose/ext-src/pgjwt-src/test-upgrade.patch  (new file)

@@ -0,0 +1,15 @@
diff --git a/test.sql b/test.sql
index d7a0ca8..f15bc76 100644
--- a/test.sql
+++ b/test.sql
@@ -9,9 +9,7 @@
 \set ON_ERROR_STOP true
 \set QUIET 1

-CREATE EXTENSION pgcrypto;
-CREATE EXTENSION pgtap;
-CREATE EXTENSION pgjwt;
+CREATE EXTENSION IF NOT EXISTS pgtap;

 BEGIN;
 SELECT plan(23);

5  docker-compose/ext-src/pgjwt-src/test-upgrade.sh  (new executable file)

@@ -0,0 +1,5 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
pg_prove -d contrib_regression test.sql
@@ -24,7 +24,7 @@ function wait_for_ready {
}
function create_extensions() {
for ext in ${1}; do
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
done
}
EXTENSIONS='[

@@ -40,7 +40,8 @@ EXTENSIONS='[
{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
@@ -285,10 +285,10 @@ To summarize, list of cplane changes:
|
||||
|
||||
### storage_controller implementation
|
||||
|
||||
Current 'load everything on startup and keep in memory' easy design is fine.
|
||||
Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
|
||||
byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
|
||||
10^6 of timelines shouldn't take more than 100MB.
|
||||
If desired, we may continue using current 'load everything on startup and keep
|
||||
in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16
|
||||
byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids
|
||||
plus some flags), so 10^6 of timelines shouldn't take more than 100MB.
|
||||
|
||||
Similar to pageserver attachment Intents storage_controller would have in-memory
|
||||
`MigrationRequest` (or its absense) for each timeline and pool of tasks trying
|
||||
@@ -296,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller
|
||||
won't do several migrations on the same timeline concurrently. In the first
|
||||
version it is simpler to have more manual control and no retries, i.e. migration
|
||||
failure removes the request. Later we can build retries and automatic
|
||||
scheduling/migration. `MigrationRequest` is
|
||||
scheduling/migration around. `MigrationRequest` is
|
||||
```
|
||||
enum MigrationRequest {
|
||||
To(Vec<NodeId>),
|
||||
@@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).
|
||||
#### Schema
|
||||
|
||||
`safekeepers` table mirroring current `nodes` should be added, except that for
|
||||
`scheduling_policy` field (seems like `status` is a better name for it): it is enough
|
||||
to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
|
||||
`decomissioned`.
|
||||
`scheduling_policy`: it is enough to have at least in the beginning only 3
|
||||
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
|
||||
3) `decomissioned` (node is removed).
|
||||
|
||||
`timelines` table:
|
||||
```
|
||||
@@ -324,18 +324,24 @@ table! {
|
||||
timelines (tenant_id, timeline_id) {
|
||||
timeline_id -> Varchar,
|
||||
tenant_id -> Varchar,
|
||||
start_lsn -> pg_lsn,
|
||||
generation -> Int4,
|
||||
sk_set -> Array<Int4>, // list of safekeeper ids
|
||||
new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
|
||||
new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
|
||||
cplane_notified_generation -> Int4,
|
||||
deleted_at -> Nullable<Timestamptz>,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`start_lsn` is needed to create timeline on safekeepers properly, see below. We
|
||||
might also want to add ancestor_timeline_id to preserve the hierarchy, but for
|
||||
this RFC it is not needed.
|
||||
|
||||
#### API
|
||||
|
||||
Node management is similar to pageserver:
|
||||
1) POST `/control/v1/safekeepers` upserts safekeeper.
|
||||
1) POST `/control/v1/safekeepers` inserts safekeeper.
|
||||
2) GET `/control/v1/safekeepers` lists safekeepers.
|
||||
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
|
||||
4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
|
||||
@@ -345,25 +351,15 @@ Node management is similar to pageserver:
|
||||
Safekeeper deploy scripts should register safekeeper at storage_contorller as
|
||||
they currently do with cplane, under the same id.
|
||||
|
||||
Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
|
||||
would 1) choose initial set of safekeepers; 2) write to the db initial
|
||||
`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
|
||||
case of conflict; 3) create timeline on the majority of safekeepers (already
|
||||
created is ok).
|
||||
Timeline creation/deletion will work through already existing POST and DELETE
|
||||
`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
|
||||
succeed. See next section on the implementation details.
|
||||
|
||||
We don't want to block timeline creation when one safekeeper is down. Currently
|
||||
this is solved by compute implicitly creating timeline on any safekeeper it is
|
||||
connected to. This creates ugly timeline state on safekeeper when timeline is
|
||||
created, but start LSN is not defined yet. It would be nice to remove this; to
|
||||
do that, controller can in the background retry to create timeline on
|
||||
safekeeper(s) which missed that during initial creation call. It can do that
|
||||
through `pull_timeline` from majority so it doesn't need to remember
|
||||
`parent_lsn` in its db.
|
||||
|
||||
Timeline deletion removes the row from the db and forwards deletion to the
|
||||
current configuration members. Without additional actions deletions might leak,
|
||||
see below on this; initially let's ignore these, reporting to cplane success if
|
||||
at least one safekeeper deleted the timeline (this will remove s3 data).
|
||||
We don't want to block timeline creation/deletion when one safekeeper is down.
|
||||
Currently this is crutched by compute implicitly creating timeline on any
|
||||
safekeeper it is connected to. This creates ugly timeline state on safekeeper
|
||||
when timeline is created, but start LSN is not defined yet. Next section
|
||||
describes dealing with this.
|
||||
|
||||
Tenant deletion repeats timeline deletion for all timelines.
|
||||
|
||||
@@ -395,26 +391,6 @@ Similar call should be added for the tenant.
|
||||
It would be great to have some way of subscribing to the results (apart from
|
||||
looking at logs/metrics).
|
||||
|
||||
Migration is executed as described above. One subtlety is that (local) deletion on
|
||||
source safekeeper might fail, which is not a problem if we are going to
|
||||
decomission the node but leaves garbage otherwise. I'd propose in the first version
|
||||
1) Don't attempt deletion at all if node status is `offline`.
|
||||
2) If it failed, just issue warning.
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
be deleted under generation number if node is not member of current generation.
|
||||
|
||||
Automating this is untrivial; we'd need to register all potential missing
|
||||
deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
|
||||
which switches configurations. Similarly when timeline is fully deleted to
|
||||
prevent cplane operation from blocking when some safekeeper is not available
|
||||
deletion should be also registered.
|
||||
|
||||
One more task pool should infinitely retry notifying control plane about changed
|
||||
safekeeper sets.
|
||||
|
||||
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
|
||||
current in memory state of the timeline and pending `MigrationRequest`,
|
||||
if any.
|
||||
@@ -423,12 +399,153 @@ safekeeper sets.
|
||||
migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
|
||||
(incrementing generation as always).
|
||||
|
||||
#### API implementation and reconciliation
|
||||
|
||||
For timeline creation/deletion we want to preserve the basic assumption that
|
||||
unreachable minority (1 sk of 3) doesn't block their completion, but eventually
|
||||
we want to finish creation/deletion on nodes which missed it (unless they are
|
||||
removed). Similarly for migration; it may and should finish even though excluded
|
||||
members missed their exclusion. And of course e.g. such pending exclusion on
|
||||
node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As
|
||||
another example, if some node missed timeline creation it clearly must not block
|
||||
migration from it. Hence it is natural to have per safekeeper background
|
||||
reconciler which retries these ops until they succeed. There are 3 possible
|
||||
operation types, and the type is defined by timeline state (membership
|
||||
configuration and whether it is deleted) and safekeeper id: we may need to
|
||||
create timeline on sk (node added), locally delete it (node excluded, somewhat
|
||||
similar to detach) or globally delete it (timeline is deleted).
|
||||
|
||||
Next, on storage controller restart in principle these pending operations can be
|
||||
figured out by comparing safekeepers state against storcon state. But it seems
|
||||
better to me to materialize them in the database; it is not expensive, avoids
|
||||
these startup scans which themselves can fail etc and makes it very easy to see
|
||||
outstanding work directly at the source of truth -- the db. So we can add table
|
||||
`safekeeper_timeline_pending_ops`
|
||||
```
|
||||
table! {
|
||||
// timeline_id, sk_id is primary key
|
||||
safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) {
|
||||
sk_id -> int8,
|
||||
tenant_id -> Varchar,
|
||||
timeline_id -> Varchar,
|
||||
generation -> Int4,
|
||||
op_type -> Varchar,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`op_type` can be `include` (seed from peers and ensure generation is up to
|
||||
date), `exclude` (remove locally) and `delete`. Field is actually not strictly
|
||||
needed as it can be computed from current configuration, but gives more explicit
|
||||
observability.
|
||||
|
||||
`generation` is necessary there because after op is done reconciler must remove
|
||||
it and not remove another row with higher gen which in theory might appear.
|
||||
|
||||
Any insert of row should overwrite (remove) all rows with the same sk and
|
||||
timeline id but lower `generation` as next op makes previous obsolete. Insertion
|
||||
of `op_type` `delete` overwrites all rows.
|
||||
|
||||
About `exclude`: rather than adding explicit safekeeper http endpoint, it is
|
||||
reasonable to reuse membership switch endpoint: if safekeeper is not member
|
||||
of the configuration it locally removes the timeline on the switch. In this case
|
||||
404 should also be considered an 'ok' answer by the caller.
|
||||
|
||||
So, the main loop of the per-sk reconciler reads `safekeeper_timeline_pending_ops`
joined with the timeline configuration to get the current conf (with generation `n`)
for the safekeeper and does the jobs, infinitely retrying failures:
1) If the node is a member (`include`):
   - Check if the timeline exists on it; if not, call pull_timeline on it from
     other members
   - Call switch configuration to the current one
2) If the node is not a member (`exclude`):
   - Call switch configuration to the current one, 404 is ok.
3) If the timeline is deleted (`delete`), call delete.

In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
timeline with generation <= `n` if `op_type` is not `delete`.
In case 3 also remove the `safekeeper_timeline_pending_ops` entry + remove the
`timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops`
for the timeline. A sketch of this pass is given below.

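Here is a compact sketch of one reconciler pass over this table. The `SkClient`
and `Storcon` traits are hypothetical stand-ins for the safekeeper HTTP client
and the storage controller's persistence layer, used only to show the control
flow.

```
/// A minimal sketch of the per-safekeeper reconciler pass described above. All of the
/// types and trait methods here are illustrative stand-ins, not real storcon code.
enum OpType {
    Include,
    Exclude,
    Delete,
}

struct PendingOp {
    tenant_id: String,
    timeline_id: String,
    generation: u32,
    op_type: OpType,
}

trait SkClient {
    /// pull_timeline from other members if the timeline doesn't exist locally yet.
    fn ensure_timeline(&self, tenant_id: &str, timeline_id: &str) -> Result<(), String>;
    /// Switch the membership configuration; treats 404 as success for excludes.
    fn switch_membership(&self, tenant_id: &str, timeline_id: &str, generation: u32) -> Result<(), String>;
    fn delete_timeline(&self, tenant_id: &str, timeline_id: &str) -> Result<(), String>;
}

trait Storcon {
    /// Pending ops for this safekeeper, joined with the current timeline configuration.
    fn pending_ops(&self, sk_id: i64) -> Vec<PendingOp>;
    /// Remove rows for this sk/timeline with generation <= the completed op's generation
    /// (and, for deletes, drop the `timelines` row if nothing else is pending).
    fn ack_op(&self, sk_id: i64, op: &PendingOp);
}

fn reconcile_one<S: SkClient>(sk: &S, op: &PendingOp) -> Result<(), String> {
    match op.op_type {
        OpType::Include => {
            sk.ensure_timeline(&op.tenant_id, &op.timeline_id)?;
            sk.switch_membership(&op.tenant_id, &op.timeline_id, op.generation)
        }
        OpType::Exclude => sk.switch_membership(&op.tenant_id, &op.timeline_id, op.generation),
        OpType::Delete => sk.delete_timeline(&op.tenant_id, &op.timeline_id),
    }
}

/// One pass over the outstanding work; failures stay in the table and are retried
/// on the next wakeup or periodic scan.
fn reconcile_pass<S: SkClient, D: Storcon>(sk_id: i64, sk: &S, db: &D) {
    for op in db.pending_ops(sk_id) {
        if reconcile_one(sk, &op).is_ok() {
            db.ack_op(sk_id, &op);
        }
    }
}
```
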
Let's consider in detail how the APIs can be implemented from this angle.

Timeline creation. It is assumed that cplane retries it until success, so all
actions must be idempotent. Now, a tricky point here is the timeline start LSN. For
the initial (tenant creation) call cplane doesn't know it. However, setting
start_lsn on safekeepers during creation is a good thing -- it provides a
guarantee that walproposer can always find a common point in the WAL histories of
a safekeeper and its own, and so its absence would be a clear sign of
corruption. The following sequence works (a sketch follows the list):
1) Create the timeline (or observe that it exists) on the pageserver,
   figuring out last_record_lsn from the response.
2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) the timeline row into the
   db. Note that last_record_lsn returned on the previous step is movable as it
   changes once ingestion starts, so the insert must not overwrite it (as well as other
   fields like the membership conf). On the contrary, start_lsn used in the next
   step must be set to the value in the db. cplane_notified_generation can be set
   to 1 (the initial generation) in the insert to avoid notifying cplane about the
   initial conf, as cplane will receive it in the timeline creation request anyway.
3) Issue timeline creation calls to at least a majority of safekeepers. Using
   a majority here is not necessary but handy because it guarantees that any live
   majority will have at least one sk with the created timeline, and so the
   reconciliation task can use pull_timeline shared with migration instead of
   a special create-timeline init case. Of course, if the timeline already exists
   the call is ignored.
4) For the minority of safekeepers which could have missed the creation, insert
   entries into `safekeeper_timeline_pending_ops`. We won't miss this insertion
   because the response to cplane is sent only after it has happened, and cplane
   retries the call until a 200 response.

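The same sequence as an illustrative sketch; `Pageserver`, `Db`, and `Safekeeper`
are assumed interfaces rather than real storcon APIs, and error handling is
reduced to strings.

```
/// Illustrative sketch of the creation sequence above; nothing here is actual storcon code.
struct TimelineRow {
    start_lsn: u64,
    generation: u32,
    cplane_notified_generation: u32,
}

trait Pageserver {
    /// Create the timeline (or observe that it already exists) and return last_record_lsn.
    fn create_timeline(&self, tenant: &str, timeline: &str) -> Result<u64, String>;
}

trait Db {
    /// INSERT ... ON CONFLICT DO NOTHING; returns the row now in the db, which on a retry
    /// is the previously inserted one with the authoritative start_lsn.
    fn insert_timeline_if_absent(&self, tenant: &str, timeline: &str, row: TimelineRow) -> TimelineRow;
    /// Record `include` pending ops for safekeepers that missed the creation.
    fn add_pending_includes(&self, tenant: &str, timeline: &str, sk_ids: &[i64], generation: u32);
}

trait Safekeeper {
    fn create_timeline(&self, tenant: &str, timeline: &str, start_lsn: u64) -> Result<(), String>;
}

fn create_timeline<P: Pageserver, D: Db, S: Safekeeper>(
    ps: &P,
    db: &D,
    safekeepers: &[(i64, S)],
    tenant: &str,
    timeline: &str,
) -> Result<(), String> {
    // 1) Create (or find) the timeline on the pageserver.
    let last_record_lsn = ps.create_timeline(tenant, timeline)?;

    // 2) Persist the timeline row idempotently; an already existing row wins.
    let row = db.insert_timeline_if_absent(
        tenant,
        timeline,
        TimelineRow { start_lsn: last_record_lsn, generation: 1, cplane_notified_generation: 1 },
    );

    // 3) Create the timeline on the safekeepers; require at least a majority.
    let mut missed = Vec::new();
    for (sk_id, sk) in safekeepers {
        if sk.create_timeline(tenant, timeline, row.start_lsn).is_err() {
            missed.push(*sk_id);
        }
    }
    let created = safekeepers.len() - missed.len();
    if created < safekeepers.len() / 2 + 1 {
        return Err("timeline not created on a majority of safekeepers".to_string());
    }

    // 4) Leave `include` pending ops for the minority that missed the creation.
    db.add_pending_includes(tenant, timeline, &missed, row.generation);
    Ok(())
}
```
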
There is a small question of how the request handler (timeline creation in this
case) would interact with the per-sk reconciler. As always I prefer to do the
simplest possible thing, and here it seems to be just waking it up so it
re-reads the db for work to do. Passing work in memory is faster, but
that shouldn't matter, and the path to scan the db for work will exist anyway,
so it is simpler to reuse it.

For pg version / wal segment size: while we may persist them in the `timelines`
table, it is not necessary, as the initial creation at step 3 can take them from
the pageserver or the cplane creation call, and later pull_timeline will carry
them around.

Timeline migration (a sketch of both steps follows the list):
1) CAS to the db to create the joint conf, and in the same transaction create
   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
   as well as deliver this conf to the current ones; poke per-sk reconcilers to work
   on it. Any conf change should also poke the cplane notifier task(s).
2) Once it becomes possible per the algorithm description above, get out of the
   joint conf with another CAS. The task should get wakeups from per-sk reconcilers
   because a conf switch is required for advancement; however retries should be
   sleep based as well, since LSN advancement might be needed, though in the happy
   path it isn't. To see whether a further transition is possible, on wakeup the
   migration executor polls safekeepers per the algorithm. The CAS creating the new
   conf with only new members should again insert entries into
   `safekeeper_timeline_pending_ops` to switch them there, as well as `exclude`
   rows to remove the timeline from the old members.

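A sketch of the two configuration changes and the pending ops they enqueue;
`MemberConf` and `Db` are hypothetical types, and the checks that gate leaving
the joint conf are elided.

```
/// Illustrative sketch of the migration driver's two configuration changes;
/// `Db` here is a hypothetical helper, not the actual storcon persistence layer.
#[derive(Clone)]
struct MemberConf {
    generation: u32,
    sk_set: Vec<i64>,
    new_sk_set: Option<Vec<i64>>, // Some(_) while in the joint configuration
}

trait Db {
    /// Compare-and-swap the membership conf; fails if `expected_generation` is stale.
    fn cas_conf(&self, timeline: &str, expected_generation: u32, new: MemberConf) -> Result<(), String>;
    fn add_pending_op(&self, timeline: &str, sk_id: i64, generation: u32, op_type: &str);
}

/// Step 1: enter the joint configuration and schedule `include` work for both the
/// current members (to deliver the new conf) and the new members (to seed them).
fn start_migration<D: Db>(db: &D, timeline: &str, cur: &MemberConf, new_set: Vec<i64>) -> Result<MemberConf, String> {
    let joint = MemberConf {
        generation: cur.generation + 1,
        sk_set: cur.sk_set.clone(),
        new_sk_set: Some(new_set.clone()),
    };
    db.cas_conf(timeline, cur.generation, joint.clone())?;
    for sk_id in cur.sk_set.iter().chain(new_set.iter()) {
        db.add_pending_op(timeline, *sk_id, joint.generation, "include");
    }
    Ok(joint)
}

/// Step 2: once the algorithm's conditions hold, leave the joint configuration and
/// schedule `exclude` work for the members that dropped out.
fn finish_migration<D: Db>(db: &D, timeline: &str, joint: &MemberConf) -> Result<(), String> {
    let new_set = joint.new_sk_set.clone().expect("must be in joint conf");
    let final_conf = MemberConf {
        generation: joint.generation + 1,
        sk_set: new_set.clone(),
        new_sk_set: None,
    };
    db.cas_conf(timeline, joint.generation, final_conf.clone())?;
    for sk_id in &new_set {
        db.add_pending_op(timeline, *sk_id, final_conf.generation, "include");
    }
    for sk_id in joint.sk_set.iter().filter(|&id| !new_set.contains(id)) {
        db.add_pending_op(timeline, *sk_id, final_conf.generation, "exclude");
    }
    Ok(())
}
```
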
Timeline deletion: just set `deleted_at` on the timeline row and insert
`safekeeper_timeline_pending_ops` entries in the same xact; the rest is done by
the per-sk reconcilers.

When a node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
for it must be cleared in the same transaction.

One more task pool should infinitely retry notifying the control plane about
changed safekeeper sets (trying to make `cplane_notified_generation` equal to
`generation`), for example as sketched below.

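For illustration, one pass of such a notifier could look like the following; the
row shape and both traits are assumptions, not the actual control plane client.

```
/// Hypothetical row shape and clients; the names below are illustrative only.
struct TimelineNotifyState {
    timeline_id: String,
    generation: u32,
    cplane_notified_generation: u32,
}

trait Db {
    fn timelines_with_unnotified_conf(&self) -> Vec<TimelineNotifyState>;
    fn set_cplane_notified_generation(&self, timeline_id: &str, generation: u32);
}

trait ControlPlane {
    fn notify_safekeeper_set(&self, timeline_id: &str, generation: u32) -> Result<(), String>;
}

/// One pass of the notifier; it runs in a loop driven by wakeups from conf changes
/// plus a periodic timer, retrying until the notified generation catches up.
fn notify_pass<D: Db, C: ControlPlane>(db: &D, cplane: &C) {
    for t in db.timelines_with_unnotified_conf() {
        if t.cplane_notified_generation == t.generation {
            continue;
        }
        if cplane.notify_safekeeper_set(&t.timeline_id, t.generation).is_ok() {
            db.set_cplane_notified_generation(&t.timeline_id, t.generation);
        }
    }
}
```
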
#### Dealing with multiple instances of storage_controller

The operations described above, executed concurrently, might create some errors
but do not prevent progress, so while we normally don't want to run multiple
instances of storage_controller it is fine to have that temporarily, e.g. during
a redeploy.

To harden against some controller instance creating work in
`safekeeper_timeline_pending_ops` and then disappearing without anyone picking up
the job, per-sk reconcilers should scan for work periodically, apart from explicit
wakeups (see the sketch below). It is possible to remove that, though, if all db
updates are protected with a leadership token/term -- then such scans are needed
only after leadership is acquired.

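A minimal sketch of that wakeup-plus-periodic-scan pattern using tokio's `Notify`
and a sleep as the fallback timer; the function and its parameters are
illustrative only, with `scan` standing in for the "read pending ops and act"
pass.

```
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

/// A reconciler driver that reacts to explicit pokes but also rescans periodically,
/// so work enqueued by a controller instance that died right after the insert is
/// still picked up eventually.
async fn reconciler_loop(wakeup: Arc<Notify>, scan_interval: Duration, mut scan: impl FnMut()) {
    loop {
        scan();
        tokio::select! {
            _ = wakeup.notified() => {}
            _ = tokio::time::sleep(scan_interval) => {}
        }
    }
}
```
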
Any interaction with the db updates the in-memory controller state; e.g. if a
migration request failed because a different one is in progress, the controller
remembers that and tries to finish it.
@@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed.

We should use the Compute <-> safekeeper protocol change to also include other
(long yearned) modifications:
- send data in network order to make arm work.
- send data in network order without putting whole structs to be arch independent
- remove term_start_lsn from AppendRequest
- add horizon to TermHistory
- add to ProposerGreeting the number of connections from this wp to sk

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use crate::{
|
||||
privilege::Privilege,
|
||||
spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role},
|
||||
spec::{Database, ExtVersion, PgIdent, Role},
|
||||
};
|
||||
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
@@ -135,26 +135,6 @@ pub struct CatalogObjects {
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
/// This is not actually a compute API response, so consider moving
|
||||
/// to a different place.
|
||||
#[derive(Deserialize, Debug)]
|
||||
pub struct ControlPlaneSpecResponse {
|
||||
pub spec: Option<ComputeSpec>,
|
||||
pub status: ControlPlaneComputeStatus,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ControlPlaneComputeStatus {
|
||||
// Compute is known to control-plane, but it's not
|
||||
// yet attached to any timeline / endpoint.
|
||||
Empty,
|
||||
// Compute is attached to some timeline / endpoint and
|
||||
// should be able to start with provided spec.
|
||||
Attached,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct InstalledExtension {
|
||||
pub extname: String,
|
||||
|
||||
@@ -204,14 +204,16 @@ impl RemoteExtSpec {
|
||||
|
||||
// Check if extension is present in public or custom.
|
||||
// If not, then it is not allowed to be used by this compute.
|
||||
if let Some(public_extensions) = &self.public_extensions {
|
||||
if !public_extensions.contains(&real_ext_name.to_string()) {
|
||||
if let Some(custom_extensions) = &self.custom_extensions {
|
||||
if !custom_extensions.contains(&real_ext_name.to_string()) {
|
||||
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
if !self
|
||||
.public_extensions
|
||||
.as_ref()
|
||||
.is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
|
||||
&& !self
|
||||
.custom_extensions
|
||||
.as_ref()
|
||||
.is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
|
||||
{
|
||||
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
|
||||
}
|
||||
|
||||
match self.extension_data.get(real_ext_name) {
|
||||
@@ -340,6 +342,102 @@ mod tests {
|
||||
use super::*;
|
||||
use std::fs::File;
|
||||
|
||||
#[test]
|
||||
fn allow_installing_remote_extensions() {
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": null,
|
||||
"custom_extensions": null,
|
||||
"library_index": {},
|
||||
"extension_data": {},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect_err("Extension should not be found");
|
||||
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": [],
|
||||
"custom_extensions": null,
|
||||
"library_index": {},
|
||||
"extension_data": {},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect_err("Extension should not be found");
|
||||
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": [],
|
||||
"custom_extensions": [],
|
||||
"library_index": {
|
||||
"ext": "ext"
|
||||
},
|
||||
"extension_data": {
|
||||
"ext": {
|
||||
"control_data": {
|
||||
"ext.control": ""
|
||||
},
|
||||
"archive_path": ""
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect_err("Extension should not be found");
|
||||
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": [],
|
||||
"custom_extensions": ["ext"],
|
||||
"library_index": {
|
||||
"ext": "ext"
|
||||
},
|
||||
"extension_data": {
|
||||
"ext": {
|
||||
"control_data": {
|
||||
"ext.control": ""
|
||||
},
|
||||
"archive_path": ""
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect("Extension should be found");
|
||||
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": ["ext"],
|
||||
"custom_extensions": [],
|
||||
"library_index": {
|
||||
"extlib": "ext",
|
||||
},
|
||||
"extension_data": {
|
||||
"ext": {
|
||||
"control_data": {
|
||||
"ext.control": ""
|
||||
},
|
||||
"archive_path": ""
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect("Extension should be found");
|
||||
|
||||
// test library index for the case when library name
|
||||
// doesn't match the extension name
|
||||
rspec
|
||||
.get_ext("extlib", true, "latest", "v17")
|
||||
.expect("Library should be found");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_spec_file() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
|
||||
@@ -94,6 +94,7 @@ pub struct ConfigToml {
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub background_task_maximum_delay: Duration,
|
||||
pub use_compaction_semaphore: bool,
|
||||
pub control_plane_api: Option<reqwest::Url>,
|
||||
pub control_plane_api_token: Option<String>,
|
||||
pub control_plane_emergency_mode: bool,
|
||||
@@ -121,6 +122,7 @@ pub struct ConfigToml {
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
pub page_service_pipelining: PageServicePipeliningConfig,
|
||||
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
pub enable_read_path_debugging: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -470,6 +472,7 @@ impl Default for ConfigToml {
|
||||
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
|
||||
)
|
||||
.unwrap()),
|
||||
use_compaction_semaphore: false,
|
||||
|
||||
control_plane_api: (None),
|
||||
control_plane_api_token: (None),
|
||||
@@ -510,6 +513,11 @@ impl Default for ConfigToml {
|
||||
} else {
|
||||
GetVectoredConcurrentIo::SidecarTask
|
||||
},
|
||||
enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
|
||||
Some(true)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,7 +76,15 @@ impl Conf {
|
||||
let mut cmd = Command::new(path);
|
||||
cmd.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
|
||||
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
|
||||
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?)
|
||||
.env(
|
||||
"ASAN_OPTIONS",
|
||||
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
|
||||
)
|
||||
.env(
|
||||
"UBSAN_OPTIONS",
|
||||
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
|
||||
);
|
||||
Ok(cmd)
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", library_search_path)
|
||||
.env("DYLD_LIBRARY_PATH", library_search_path)
|
||||
.env(
|
||||
"ASAN_OPTIONS",
|
||||
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
|
||||
)
|
||||
.env(
|
||||
"UBSAN_OPTIONS",
|
||||
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
|
||||
)
|
||||
.stdin(std::process::Stdio::null())
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
|
||||
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
@@ -45,11 +45,11 @@ impl RemoteStorageKind {
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
/// Helper to fetch the configured concurrency limit.
|
||||
pub fn concurrency_limit(&self) -> Option<usize> {
|
||||
pub fn concurrency_limit(&self) -> usize {
|
||||
match &self.storage {
|
||||
RemoteStorageKind::LocalFs { .. } => None,
|
||||
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
|
||||
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
|
||||
RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
|
||||
RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(),
|
||||
RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// Here, a limit of max 20k concurrent connections was noted.
|
||||
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
|
||||
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// Set this limit analogously to the S3 limit.
|
||||
///
|
||||
/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds
|
||||
/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the
|
||||
/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks.
|
||||
pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currently means 1000 for AWS S3.
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
@@ -39,7 +39,7 @@ function initdb_with_args {
|
||||
;;
|
||||
esac
|
||||
|
||||
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
|
||||
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}"
|
||||
}
|
||||
|
||||
rm -fr "$DATA_DIR"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::Future;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -29,6 +30,11 @@ pub async fn exponential_backoff(
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration {
|
||||
let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
Duration::from_secs_f64(seconds)
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
|
||||
@@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames};
|
||||
/// Logs a critical error, similarly to `tracing::error!`. This will:
|
||||
///
|
||||
/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
|
||||
/// * Trigger a pageable alert (via the metric below).
|
||||
/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
|
||||
/// * Trigger a pageable alert (via the metric above).
|
||||
/// * In debug builds, panic the process.
|
||||
///
|
||||
/// When including errors in the message, please use {err:?} to include the error cause and original
|
||||
/// backtrace.
|
||||
#[macro_export]
|
||||
macro_rules! critical {
|
||||
($($arg:tt)*) => {
|
||||
($($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) {
|
||||
panic!($($arg)*);
|
||||
}
|
||||
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
|
||||
};
|
||||
}};
|
||||
}
|
||||
|
||||
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
|
||||
@@ -140,6 +140,10 @@ pub struct PageServerConf {
|
||||
/// not terrible.
|
||||
pub background_task_maximum_delay: Duration,
|
||||
|
||||
/// If true, use a separate semaphore for compaction tasks instead of the common background task
|
||||
/// semaphore. Defaults to false.
|
||||
pub use_compaction_semaphore: bool,
|
||||
|
||||
pub control_plane_api: Option<Url>,
|
||||
|
||||
/// JWT token for use with the control plane API.
|
||||
@@ -193,6 +197,10 @@ pub struct PageServerConf {
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
|
||||
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
|
||||
|
||||
/// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
|
||||
/// files read.
|
||||
pub enable_read_path_debugging: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -332,6 +340,7 @@ impl PageServerConf {
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
use_compaction_semaphore,
|
||||
control_plane_api,
|
||||
control_plane_api_token,
|
||||
control_plane_emergency_mode,
|
||||
@@ -355,6 +364,7 @@ impl PageServerConf {
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -385,6 +395,7 @@ impl PageServerConf {
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
use_compaction_semaphore,
|
||||
control_plane_api,
|
||||
control_plane_emergency_mode,
|
||||
heatmap_upload_concurrency,
|
||||
@@ -440,6 +451,7 @@ impl PageServerConf {
|
||||
.unwrap_or_default(),
|
||||
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
@@ -8,7 +8,6 @@ use std::time::Duration;
|
||||
|
||||
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
@@ -463,45 +462,18 @@ impl DeletionQueueClient {
|
||||
///
|
||||
/// The `current_generation` is the generation of this pageserver's current attachment. The
|
||||
/// generations in `layers` are the generations in which those layers were written.
|
||||
pub(crate) async fn push_layers(
|
||||
pub(crate) fn push_layers(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
current_generation: Generation,
|
||||
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
if current_generation.is_none() {
|
||||
debug!("Enqueuing deletions in legacy mode, skipping queue");
|
||||
// None generations are not valid for attached tenants: they must always be attached in
|
||||
// a known generation. None generations are still permitted for layers in the index because
|
||||
// they may be historical.
|
||||
assert!(!current_generation.is_none());
|
||||
|
||||
let mut layer_paths = Vec::new();
|
||||
for (layer, meta) in layers {
|
||||
layer_paths.push(remote_layer_path(
|
||||
&tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
meta.shard,
|
||||
&layer,
|
||||
meta.generation,
|
||||
));
|
||||
}
|
||||
self.push_immediate(layer_paths).await?;
|
||||
return self.flush_immediate().await;
|
||||
}
|
||||
|
||||
self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
|
||||
}
|
||||
|
||||
/// When a Tenant has a generation, push_layers is always synchronous because
|
||||
/// the ListValidator channel is an unbounded channel.
|
||||
///
|
||||
/// This can be merged into push_layers when we remove the Generation-less mode
|
||||
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
|
||||
pub(crate) fn push_layers_sync(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
current_generation: Generation,
|
||||
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
metrics::DELETION_QUEUE
|
||||
.keys_submitted
|
||||
.inc_by(layers.len() as u64);
|
||||
@@ -957,14 +929,12 @@ mod test {
|
||||
|
||||
// File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
|
||||
info!("Pushing");
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
|
||||
)?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
|
||||
assert_local_files(&[], &deletion_prefix);
|
||||
@@ -1017,14 +987,12 @@ mod test {
|
||||
assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
|
||||
|
||||
tracing::debug!("Pushing...");
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
stale_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
stale_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
// We enqueued the operation in a stale generation: it should have failed validation
|
||||
tracing::debug!("Flushing...");
|
||||
@@ -1032,14 +1000,12 @@ mod test {
|
||||
assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
|
||||
|
||||
tracing::debug!("Pushing...");
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
latest_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
latest_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
// We enqueued the operation in a fresh generation: it should have passed validation
|
||||
tracing::debug!("Flushing...");
|
||||
@@ -1074,28 +1040,24 @@ mod test {
|
||||
// generation gets that treatment)
|
||||
let remote_layer_file_name_historical =
|
||||
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation.previous(),
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation.previous(),
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
// Inject a deletion in the generation before generation_now: after restart,
|
||||
// this deletion should get executed, because we execute deletions in the
|
||||
// immediately previous generation on the same node.
|
||||
let remote_layer_file_name_previous =
|
||||
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
client.flush().await?;
|
||||
assert_remote_files(
|
||||
@@ -1139,6 +1101,7 @@ pub(crate) mod mock {
|
||||
use tracing::info;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
pub struct ConsumerState {
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::{
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
|
||||
tasks::sleep_random,
|
||||
},
|
||||
CancellableTask, DiskUsageEvictionTask,
|
||||
};
|
||||
@@ -210,14 +211,8 @@ async fn disk_usage_eviction_task(
|
||||
info!("disk usage based eviction task finishing");
|
||||
};
|
||||
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
if random_init_delay(task_config.period, &cancel)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
return;
|
||||
}
|
||||
if sleep_random(task_config.period, &cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut iteration_no = 0;
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use enum_map::EnumMap;
|
||||
use enum_map::{Enum as _, EnumMap};
|
||||
use futures::Future;
|
||||
use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
@@ -32,6 +32,7 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::pgdatadir_mapping::DatadirModificationStats;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
@@ -103,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
@@ -235,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
|
||||
|
||||
GetVectoredLatency {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
|
||||
let task_kind = TaskKind::from_usize(task_kind_idx);
|
||||
|
||||
if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
|
||||
let task_kind = task_kind.into();
|
||||
@@ -258,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
|
||||
|
||||
ScanLatency {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
|
||||
let task_kind = TaskKind::from_usize(task_kind_idx);
|
||||
|
||||
if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
|
||||
let task_kind = task_kind.into();
|
||||
@@ -299,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
|
||||
pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
|
||||
let task_kind = TaskKind::from_usize(task_kind);
|
||||
let task_kind: &'static str = task_kind.into();
|
||||
EnumMap::from_array(std::array::from_fn(|content_kind| {
|
||||
let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
|
||||
let content_kind = PageContentKind::from_usize(content_kind);
|
||||
let content_kind: &'static str = content_kind.into();
|
||||
PageCacheMetricsForTaskKind {
|
||||
read_accesses_immutable: {
|
||||
@@ -1912,7 +1913,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy
|
||||
|
||||
ComputeCommandCounters {
|
||||
map: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
|
||||
let command = ComputeCommandKind::from_usize(i);
|
||||
let command_str: &'static str = command.into();
|
||||
inner.with_label_values(&[command_str])
|
||||
})),
|
||||
@@ -2212,11 +2213,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
|
||||
pub struct BackgroundLoopSemaphoreMetrics {
|
||||
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
||||
durations: EnumMap<BackgroundLoopKind, Counter>,
|
||||
durations: EnumMap<BackgroundLoopKind, Histogram>,
|
||||
waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
|
||||
running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
|
||||
}
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
||||
|| {
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
|
||||
Lazy::new(|| {
|
||||
let counters = register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
@@ -2226,45 +2229,101 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let durations = register_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_duration_seconds",
|
||||
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
|
||||
let durations = register_histogram_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_seconds",
|
||||
"Seconds spent waiting on background loop semaphore acquisition",
|
||||
&["task"],
|
||||
vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let waiting_tasks = register_int_gauge_vec!(
|
||||
"pageserver_background_loop_semaphore_waiting_tasks",
|
||||
"Number of background loop tasks waiting for semaphore",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let running_tasks = register_int_gauge_vec!(
|
||||
"pageserver_background_loop_semaphore_running_tasks",
|
||||
"Number of background loop tasks running concurrently",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
BackgroundLoopSemaphoreMetrics {
|
||||
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
counters: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = BackgroundLoopKind::from_usize(i);
|
||||
counters.with_label_values(&[kind.into()])
|
||||
})),
|
||||
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
durations: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = BackgroundLoopKind::from_usize(i);
|
||||
durations.with_label_values(&[kind.into()])
|
||||
})),
|
||||
waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = BackgroundLoopKind::from_usize(i);
|
||||
waiting_tasks.with_label_values(&[kind.into()])
|
||||
})),
|
||||
running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = BackgroundLoopKind::from_usize(i);
|
||||
running_tasks.with_label_values(&[kind.into()])
|
||||
})),
|
||||
}
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
impl BackgroundLoopSemaphoreMetrics {
|
||||
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
|
||||
struct Record<'a> {
|
||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
||||
task: BackgroundLoopKind,
|
||||
_counter_guard: metrics::IntCounterPairGuard,
|
||||
start: Instant,
|
||||
}
|
||||
impl Drop for Record<'_> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed().as_secs_f64();
|
||||
self.metrics.durations[self.task].inc_by(elapsed);
|
||||
}
|
||||
}
|
||||
Record {
|
||||
metrics: self,
|
||||
/// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
|
||||
/// semaphore is acquired, and drop it when the task completes or is cancelled.
|
||||
pub(crate) fn record(
|
||||
&self,
|
||||
task: BackgroundLoopKind,
|
||||
) -> BackgroundLoopSemaphoreMetricsRecorder {
|
||||
BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
|
||||
}
|
||||
}
|
||||
|
||||
/// Records metrics for a background task.
|
||||
pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
|
||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
||||
task: BackgroundLoopKind,
|
||||
start: Instant,
|
||||
wait_counter_guard: Option<metrics::IntCounterPairGuard>,
|
||||
}
|
||||
|
||||
impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
|
||||
/// Starts recording semaphore metrics, by recording wait time and incrementing
|
||||
/// `wait_start_count` and `waiting_tasks`.
|
||||
fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
|
||||
metrics.waiting_tasks[task].inc();
|
||||
Self {
|
||||
metrics,
|
||||
task,
|
||||
_counter_guard: self.counters[task].guard(),
|
||||
start: Instant::now(),
|
||||
wait_counter_guard: Some(metrics.counters[task].guard()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Signals that the semaphore has been acquired, and updates relevant metrics.
|
||||
pub fn acquired(&mut self) -> Duration {
|
||||
let waited = self.start.elapsed();
|
||||
self.wait_counter_guard.take().expect("already acquired");
|
||||
self.metrics.durations[self.task].observe(waited.as_secs_f64());
|
||||
self.metrics.waiting_tasks[self.task].dec();
|
||||
self.metrics.running_tasks[self.task].inc();
|
||||
waited
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
|
||||
/// The task either completed or was cancelled.
|
||||
fn drop(&mut self) {
|
||||
if self.wait_counter_guard.take().is_some() {
|
||||
// Waiting.
|
||||
self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
|
||||
self.metrics.waiting_tasks[self.task].dec();
|
||||
} else {
|
||||
// Running.
|
||||
self.metrics.running_tasks[self.task].dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2378,11 +2437,40 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_observed: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) values_committed_metadata_images: IntCounter,
|
||||
pub(crate) values_committed_metadata_deltas: IntCounter,
|
||||
pub(crate) values_committed_data_images: IntCounter,
|
||||
pub(crate) values_committed_data_deltas: IntCounter,
|
||||
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
|
||||
pub(crate) clear_vm_bits_unknown: IntCounterVec,
|
||||
}
|
||||
|
||||
impl WalIngestMetrics {
|
||||
pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
|
||||
if stats.metadata_images > 0 {
|
||||
self.values_committed_metadata_images
|
||||
.inc_by(stats.metadata_images);
|
||||
}
|
||||
if stats.metadata_deltas > 0 {
|
||||
self.values_committed_metadata_deltas
|
||||
.inc_by(stats.metadata_deltas);
|
||||
}
|
||||
if stats.data_images > 0 {
|
||||
self.values_committed_data_images.inc_by(stats.data_images);
|
||||
}
|
||||
if stats.data_deltas > 0 {
|
||||
self.values_committed_data_deltas.inc_by(stats.data_deltas);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
let values_committed = register_int_counter_vec!(
|
||||
"pageserver_wal_ingest_values_committed",
|
||||
"Number of values committed to pageserver storage from WAL records",
|
||||
&["class", "kind"],
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
WalIngestMetrics {
|
||||
bytes_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_bytes_received",
|
||||
@@ -2409,17 +2497,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
|
||||
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
|
||||
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
|
||||
values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
|
||||
gap_blocks_zeroed_on_rel_extend: register_int_counter!(
|
||||
"pageserver_gap_blocks_zeroed_on_rel_extend",
|
||||
"Total number of zero gap blocks written on relation extends"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
clear_vm_bits_unknown: register_int_counter_vec!(
|
||||
"pageserver_wal_ingest_clear_vm_bits_unknown",
|
||||
"Number of ignored ClearVmBits operations due to unknown pages/relations",
|
||||
&["entity"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
});
|
||||
|
||||
@@ -2486,7 +2572,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
|
||||
|
||||
pub(crate) struct WalRedoProcessCounters {
|
||||
pub(crate) started: IntCounter,
|
||||
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
||||
pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
|
||||
pub(crate) active_stderr_logger_tasks_started: IntCounter,
|
||||
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
|
||||
}
|
||||
@@ -2528,7 +2614,7 @@ impl Default for WalRedoProcessCounters {
|
||||
Self {
|
||||
started,
|
||||
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
|
||||
let cause = WalRedoKillCause::from_usize(i);
|
||||
let cause_str: &'static str = cause.into();
|
||||
killed.with_label_values(&[cause_str])
|
||||
})),
|
||||
|
||||
@@ -489,7 +489,6 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
|
||||
let timeline = tenant_shard
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(GetActiveTimelineError::Timeline)?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
@@ -774,11 +773,11 @@ impl PageServerHandler {
|
||||
|
||||
let batched_msg = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelExists,
|
||||
@@ -793,11 +792,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelSize,
|
||||
@@ -812,11 +810,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetDbSize,
|
||||
@@ -831,11 +828,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetSlruSegment,
|
||||
@@ -850,12 +846,20 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
|
||||
// avoid a somewhat costly Span::record() by constructing the entire span in one go.
|
||||
macro_rules! mkspan {
|
||||
(before shard routing) => {{
|
||||
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
|
||||
}};
|
||||
($shard_id:expr) => {{
|
||||
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! respond_error {
|
||||
($error:expr) => {{
|
||||
($span:expr, $error:expr) => {{
|
||||
let error = BatchedFeMessage::RespondError {
|
||||
span,
|
||||
span: $span,
|
||||
error: BatchedPageStreamError {
|
||||
req: req.hdr,
|
||||
err: $error,
|
||||
@@ -868,27 +872,35 @@ impl PageServerHandler {
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
let shard = match timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Page(key))
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await
|
||||
{
|
||||
Ok(tl) => tl,
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning `Reconnect` has the side effect of rate-limiting the above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return respond_error!(PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into()
|
||||
));
|
||||
}
|
||||
Err(e) => {
|
||||
return respond_error!(e.into());
|
||||
let span = mkspan!(before shard routing);
|
||||
match e {
|
||||
GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning `Reconnect` has the side effect of rate-limiting the above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return respond_error!(
|
||||
span,
|
||||
PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into()
|
||||
)
|
||||
);
|
||||
}
|
||||
e => {
|
||||
return respond_error!(span, e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
let span = mkspan!(shard.tenant_shard_id.shard_slug());
|
||||
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
@@ -910,7 +922,7 @@ impl PageServerHandler {
|
||||
{
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
return respond_error!(e);
|
||||
return respond_error!(span, e);
|
||||
}
|
||||
};
|
||||
BatchedFeMessage::GetPage {
|
||||
@@ -922,11 +934,10 @@ impl PageServerHandler {
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessage::Test(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer =
|
||||
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
|
||||
.await?;
|
||||
@@ -1190,6 +1201,29 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
// We purposefully don't count flush time into the smgr operation timer.
|
||||
//
|
||||
// The reason is that current compute client will not perform protocol processing
|
||||
// if the postgres backend process is doing things other than `->smgr_read()`.
|
||||
// This is especially the case for prefetch.
|
||||
//
|
||||
// If the compute doesn't read from the connection, eventually TCP will backpressure
|
||||
// all the way into our flush call below.
|
||||
//
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
//
|
||||
// We put each response in the batch onto the wire in a separate pgb_writer.flush()
|
||||
// call, which (all unmeasured) adds syscall overhead but reduces time to first byte
|
||||
// and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
|
||||
// TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
|
||||
//
|
||||
// Since we're flushing multiple times in the loop, but only have access to the per-op
|
||||
// timers inside the loop, we capture the flush start time here and reuse it to finish
|
||||
// each op timer.
|
||||
let flushing_start_time = Instant::now();
|
||||
|
||||
// Map handler result to protocol behavior.
|
||||
// Some handler errors cause exit from pagestream protocol.
|
||||
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
|
||||
@@ -1238,21 +1272,9 @@ impl PageServerHandler {
|
||||
&response_msg.serialize(protocol_version),
|
||||
))?;
|
||||
|
||||
// We purposefully don't count flush time into the timer.
|
||||
//
|
||||
// The reason is that current compute client will not perform protocol processing
|
||||
// if the postgres backend process is doing things other than `->smgr_read()`.
|
||||
// This is especially the case for prefetch.
|
||||
//
|
||||
// If the compute doesn't read from the connection, eventually TCP will backpressure
|
||||
// all the way into our flush call below.
|
||||
//
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer = timer.map(|mut timer| {
|
||||
timer
|
||||
.observe_execution_end_flush_start(Instant::now())
|
||||
.observe_execution_end_flush_start(flushing_start_time)
|
||||
.expect("we are the first caller")
|
||||
});
|
||||
|
||||
@@ -1280,8 +1302,6 @@ impl PageServerHandler {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
// and log the info! line inside the request span
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -1342,7 +1362,7 @@ impl PageServerHandler {
|
||||
.take()
|
||||
.expect("implementation error: timeline_handles should not be locked");
|
||||
|
||||
let request_span = info_span!("request", shard_id = tracing::field::Empty);
|
||||
let request_span = info_span!("request");
|
||||
let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
|
||||
PageServicePipeliningConfig::Pipelined(pipelining_config) => {
|
||||
self.handle_pagerequests_pipelined(
|
||||
@@ -1692,7 +1712,7 @@ impl PageServerHandler {
|
||||
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
|
||||
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
|
||||
let gc_info = &timeline.gc_info.read().unwrap();
|
||||
if !gc_info.leases.contains_key(&request_lsn) {
|
||||
if !gc_info.lsn_covered_by_lease(request_lsn) {
|
||||
return Err(
|
||||
PageStreamError::BadRequest(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
@@ -2036,6 +2056,13 @@ impl PageServerHandler {
|
||||
.unwrap()
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
|
||||
if timeline.is_archived() == Some(true) {
|
||||
// TODO after a grace period, turn this log line into a hard error
|
||||
tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
|
||||
//return Err(QueryError::NotFound("timeline is archived".into()))
|
||||
}
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
|
||||
@@ -48,7 +48,7 @@ use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
@@ -612,11 +612,18 @@ impl Timeline {
|
||||
pausable_failpoint!("find-lsn-for-timestamp-pausable");
|
||||
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let gc_cutoff_planned = {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
gc_info.min_cutoff()
|
||||
};
|
||||
// Usually the planned cutoff is newer than the cutoff of the last gc run,
|
||||
// but let's be defensive.
|
||||
let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard);
|
||||
// We use this method to figure out the branching LSN for the new branch, but the
|
||||
// GC cutoff could be before the branching point and we cannot create a new branch
|
||||
// with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
|
||||
// on the safe side.
|
||||
let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
|
||||
let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn());
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
|
||||
// LSNs are always 8-byte aligned. low/mid/high represent the
|
||||
@@ -1297,6 +1304,26 @@ impl DatadirModification<'_> {
|
||||
.is_some_and(|b| b.has_data())
|
||||
}
|
||||
|
||||
/// Returns statistics about the currently pending modifications.
|
||||
pub(crate) fn stats(&self) -> DatadirModificationStats {
|
||||
let mut stats = DatadirModificationStats::default();
|
||||
for (_, _, value) in self.pending_metadata_pages.values().flatten() {
|
||||
match value {
|
||||
Value::Image(_) => stats.metadata_images += 1,
|
||||
Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
|
||||
Value::WalRecord(_) => stats.metadata_deltas += 1,
|
||||
}
|
||||
}
|
||||
for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
|
||||
match valuemeta {
|
||||
ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
|
||||
ValueMeta::Serialized(_) => stats.data_deltas += 1,
|
||||
ValueMeta::Observed(_) => {}
|
||||
}
|
||||
}
|
||||
stats
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
@@ -2317,6 +2344,15 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Statistics for a DatadirModification.
|
||||
#[derive(Default)]
|
||||
pub struct DatadirModificationStats {
|
||||
pub metadata_images: u64,
|
||||
pub metadata_deltas: u64,
|
||||
pub data_images: u64,
|
||||
pub data_deltas: u64,
|
||||
}
|
||||
|
||||
/// This struct facilitates accessing either a committed key from the timeline at a
|
||||
/// specific LSN, or the latest uncommitted key from a pending modification.
|
||||
///
|
||||
|
||||
@@ -328,8 +328,8 @@ pub enum TaskKind {
|
||||
// Eviction. One per timeline.
|
||||
Eviction,
|
||||
|
||||
// Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
|
||||
IngestHousekeeping,
|
||||
// Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.).
|
||||
TenantHousekeeping,
|
||||
|
||||
/// See [`crate::disk_usage_eviction_task`].
|
||||
DiskUsageEviction,
|
||||
|
||||
@@ -20,6 +20,7 @@ use chrono::NaiveDateTime;
|
||||
use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::CompactInfoResponse;
|
||||
use pageserver_api::models::LsnLease;
|
||||
@@ -55,6 +56,7 @@ use timeline::CompactOptions;
|
||||
use timeline::ShutdownMode;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
use tokio::sync::Notify;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -349,6 +351,9 @@ pub struct Tenant {
|
||||
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
|
||||
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
|
||||
|
||||
/// Signals the tenant compaction loop that there is L0 compaction work to be done.
|
||||
pub(crate) l0_compaction_trigger: Arc<Notify>,
|
||||
|
||||
/// Scheduled gc-compaction tasks.
|
||||
scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
|
||||
|
||||
@@ -1690,12 +1695,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
self.get_timeline_resources_for(remote_client),
|
||||
LoadTimelineCause::Attach,
|
||||
ctx,
|
||||
)
|
||||
@@ -3088,32 +3088,28 @@ impl Tenant {
|
||||
Ok(rx)
|
||||
}
|
||||
|
||||
// Call through to all timelines to freeze ephemeral layers if needed. Usually
|
||||
// this happens during ingest: this background housekeeping is for freezing layers
|
||||
// that are open but haven't been written to for some time.
|
||||
async fn ingest_housekeeping(&self) {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// compactions. We don't want to block everything else while the
|
||||
// compaction runs.
|
||||
let timelines = {
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter_map(|timeline| {
|
||||
if timeline.is_active() {
|
||||
Some(timeline.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
/// Performs periodic housekeeping, via the tenant housekeeping background task.
|
||||
async fn housekeeping(&self) {
|
||||
// Call through to all timelines to freeze ephemeral layers as needed. This usually happens
|
||||
// during ingest, but we don't want idle timelines to hold open layers for too long.
|
||||
let timelines = self
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tli| tli.is_active())
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
|
||||
for timeline in &timelines {
|
||||
for timeline in timelines {
|
||||
timeline.maybe_freeze_ephemeral_layer().await;
|
||||
}
|
||||
|
||||
// Shut down walredo if idle.
|
||||
const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
|
||||
if let Some(ref walredo_mgr) = self.walredo_mgr {
|
||||
walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
|
||||
@@ -4115,6 +4111,7 @@ impl Tenant {
|
||||
// use an extremely long backoff.
|
||||
Some(Duration::from_secs(3600 * 24)),
|
||||
)),
|
||||
l0_compaction_trigger: Arc::new(Notify::new()),
|
||||
scheduled_compaction_tasks: Mutex::new(Default::default()),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
|
||||
@@ -4642,22 +4639,26 @@ impl Tenant {
|
||||
|
||||
// check against last actual 'latest_gc_cutoff' first
|
||||
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
))
|
||||
.map_err(CreateTimelineError::AncestorLsn)?;
|
||||
|
||||
// and then the planned GC cutoff
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let cutoff = gc_info.min_cutoff();
|
||||
if start_lsn < cutoff {
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
|
||||
)));
|
||||
let planned_cutoff = gc_info.min_cutoff();
|
||||
if gc_info.lsn_covered_by_lease(start_lsn) {
|
||||
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
|
||||
} else {
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
))
|
||||
.map_err(CreateTimelineError::AncestorLsn)?;
|
||||
|
||||
// and then the planned GC cutoff
|
||||
if start_lsn < planned_cutoff {
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
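The reordered branch-creation hunk above only skips the GC-cutoff comparisons when the requested branch point is covered by an LSN lease. A simplified sketch of that decision order, using plain `u64` LSNs and a set of leased LSNs as stand-ins for the real `Lsn` and `GcInfo` types:

```rust
use std::collections::BTreeSet;

#[derive(Debug, PartialEq)]
enum BranchCheck {
    Ok,
    BelowLatestCutoff,
    BelowPlannedCutoff,
}

/// Validate a branch start LSN: a lease on the exact LSN bypasses both cutoff checks,
/// otherwise the LSN must be at or above the latest and the planned GC cutoffs.
fn check_branch_start(
    start_lsn: u64,
    latest_gc_cutoff: u64,
    planned_gc_cutoff: u64,
    leases: &BTreeSet<u64>,
) -> BranchCheck {
    if leases.contains(&start_lsn) {
        return BranchCheck::Ok; // the lease keeps this LSN readable regardless of cutoffs
    }
    if start_lsn < latest_gc_cutoff {
        return BranchCheck::BelowLatestCutoff;
    }
    if start_lsn < planned_gc_cutoff {
        return BranchCheck::BelowPlannedCutoff;
    }
    BranchCheck::Ok
}

fn main() {
    let leases = BTreeSet::from([0x30]);
    assert_eq!(check_branch_start(0x30, 0x40, 0x50, &leases), BranchCheck::Ok);
    assert_eq!(
        check_branch_start(0x20, 0x40, 0x50, &leases),
        BranchCheck::BelowLatestCutoff
    );
    assert_eq!(
        check_branch_start(0x48, 0x40, 0x50, &leases),
        BranchCheck::BelowPlannedCutoff
    );
    println!("branch start checks behave as expected");
}
```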
@@ -5019,12 +5020,19 @@ impl Tenant {
|
||||
)
|
||||
}
|
||||
|
||||
/// Call this before constructing a timeline, to build its required structures
|
||||
/// Builds required resources for a new timeline.
|
||||
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
|
||||
let remote_client = self.build_timeline_remote_client(timeline_id);
|
||||
self.get_timeline_resources_for(remote_client)
|
||||
}
|
||||
|
||||
/// Builds timeline resources for the given remote client.
|
||||
fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources {
|
||||
TimelineResources {
|
||||
remote_client: self.build_timeline_remote_client(timeline_id),
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
}
|
||||
}
|
||||
@@ -7697,6 +7705,18 @@ mod tests {
|
||||
}
|
||||
|
||||
tline.freeze_and_flush().await?;
|
||||
// Force layers to L1
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceL0Compaction);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if iter % 5 == 0 {
|
||||
let (_, before_delta_file_accessed) =
|
||||
@@ -7709,6 +7729,7 @@ mod tests {
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags.insert(CompactFlags::ForceL0Compaction);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
@@ -8155,6 +8176,8 @@ mod tests {
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
|
||||
tline.force_set_disk_consistent_lsn(Lsn(0x40));
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
@@ -8168,8 +8191,7 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
// Image layers are created at repartition LSN
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
|
||||
.await
|
||||
|
||||
@@ -437,8 +437,7 @@ impl RemoteTimelineClient {
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
.map_or(0, |r| r.concurrency_limit());
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
@@ -461,8 +460,7 @@ impl RemoteTimelineClient {
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
.map_or(0, |r| r.concurrency_limit());
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
@@ -484,8 +482,7 @@ impl RemoteTimelineClient {
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
.map_or(0, |r| r.concurrency_limit());
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
@@ -520,7 +517,7 @@ impl RemoteTimelineClient {
|
||||
if let Ok(queue) = queue_locked.initialized_mut() {
|
||||
let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
|
||||
for d in blocked_deletions {
|
||||
if let Err(e) = self.deletion_queue_client.push_layers_sync(
|
||||
if let Err(e) = self.deletion_queue_client.push_layers(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
self.generation,
|
||||
@@ -2154,7 +2151,6 @@ impl RemoteTimelineClient {
|
||||
self.generation,
|
||||
delete.layers.clone(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))
|
||||
}
|
||||
}
|
||||
|
||||
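The three `RemoteTimelineClient` hunks above collapse an `.as_ref()`/`unwrap_or(0)` chain into a single `.map_or(0, ..)`. The real `concurrency_limit()` signature is not shown in this diff, so the quick equivalence check below uses a placeholder config type whose method returns a plain `usize`:

```rust
struct RemoteStorageConfig {
    concurrency: usize,
}

impl RemoteStorageConfig {
    // Placeholder: the real concurrency_limit() signature is not shown in this diff.
    fn concurrency_limit(&self) -> usize {
        self.concurrency
    }
}

fn main() {
    for cfg in [None, Some(RemoteStorageConfig { concurrency: 8 })] {
        // Two equivalent ways to default a missing config to 0:
        let via_map = cfg.as_ref().map(|r| r.concurrency_limit()).unwrap_or(0);
        let via_map_or = cfg.as_ref().map_or(0, |r| r.concurrency_limit());
        assert_eq!(via_map, via_map_or);
    }
    println!("map + unwrap_or and map_or agree");
}
```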
@@ -9,13 +9,14 @@ use crate::{
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
mgr::GetTenantError,
|
||||
mgr::TenantManager,
|
||||
mgr::{GetTenantError, TenantManager},
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
tasks::{warn_when_period_overrun, BackgroundLoopKind},
|
||||
Tenant,
|
||||
},
|
||||
virtual_file::VirtualFile,
|
||||
TEMP_FILE_SUFFIX,
|
||||
};
|
||||
|
||||
use futures::Future;
|
||||
@@ -32,7 +33,10 @@ use super::{
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension,
|
||||
yielding_loop::yielding_loop,
|
||||
};
|
||||
|
||||
pub(super) async fn heatmap_uploader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
@@ -461,6 +465,18 @@ async fn upload_tenant_heatmap(
|
||||
}
|
||||
}
|
||||
|
||||
// After a successful upload persist the fresh heatmap to disk.
// When restarting, the tenant will read the heatmap from disk
// and additively generate a new heatmap (see [`Timeline::generate_heatmap`]).
// If the heatmap is stale, the additive generation can lead to keeping previously
// evicted timelines on the secondary's disk.
let tenant_shard_id = tenant.get_tenant_shard_id();
let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id);
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await {
    tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}");
}

tracing::info!("Successfully uploaded {size} byte heatmap to {path}");

Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {

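`VirtualFile::crashsafe_overwrite` itself is not reproduced in this diff; the general pattern it follows (write to a temp path, then rename over the target) can be sketched with std APIs as below. This illustrates the pattern only, not the pageserver implementation:

```rust
use std::fs::{self, File};
use std::io::{self, Write};
use std::path::Path;

/// Write `bytes` to `path` via a temporary file plus rename, so a crash mid-write
/// never leaves a truncated file at `path`.
fn crashsafe_overwrite(path: &Path, temp_path: &Path, bytes: &[u8]) -> io::Result<()> {
    let mut tmp = File::create(temp_path)?;
    tmp.write_all(bytes)?;
    tmp.sync_all()?; // flush data to disk before the rename makes it visible
    fs::rename(temp_path, path)?;
    // Durability of the rename itself would additionally require fsyncing the
    // parent directory; omitted here for brevity.
    Ok(())
}

fn main() -> io::Result<()> {
    let dir = std::env::temp_dir();
    let target = dir.join("heatmap-example.json");
    let temp = dir.join("heatmap-example.json.___temp");
    crashsafe_overwrite(&target, &temp, b"{\"timelines\": []}")?;
    println!("wrote {}", target.display());
    Ok(())
}
```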
@@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use super::PageReconstructError;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
@@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState {
|
||||
|
||||
pub(crate) io_concurrency: IoConcurrency,
|
||||
num_active_ios: Arc<AtomicUsize>,
|
||||
|
||||
pub(crate) read_path: Option<ReadPath>,
|
||||
}
|
||||
|
||||
/// The level of IO concurrency to be used on the read path
|
||||
@@ -609,6 +611,7 @@ impl ValuesReconstructState {
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency,
|
||||
num_active_ios: Arc::new(AtomicUsize::new(0)),
|
||||
read_path: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,53 +1,81 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as compaction and GC
|
||||
//! This module contains per-tenant background processes, e.g. compaction and GC.
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::cmp::max;
|
||||
use std::future::Future;
|
||||
use std::ops::{ControlFlow, RangeInclusive};
|
||||
use std::pin::pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::Rng;
|
||||
use scopeguard::defer;
|
||||
use tokio::sync::{Semaphore, SemaphorePermit};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::compaction::CompactionOutcome;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use rand::Rng;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{backoff, completion, pausable_failpoint};
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
|
||||
use utils::backoff::exponential_backoff_duration;
|
||||
use utils::completion::Barrier;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::rate_limit::RateLimit;
|
||||
|
||||
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
|
||||
let permits = usize::max(
|
||||
1,
|
||||
// while a lot of the work is done on spawn_blocking, we still do
|
||||
// repartitioning in the async context. this should give leave us some workers
|
||||
// unblocked to be blocked on other work, hopefully easing any outside visible
|
||||
// effects of restarts.
|
||||
//
|
||||
// 6/8 is a guess; previously we ran with unlimited 8 and more from
|
||||
// spawn_blocking.
|
||||
(total_threads * 3).checked_div(4).unwrap_or(0),
|
||||
);
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(
|
||||
permits < total_threads,
|
||||
"need threads avail for shorter work"
|
||||
);
|
||||
tokio::sync::Semaphore::new(permits)
|
||||
});
|
||||
/// Semaphore limiting concurrent background tasks (across all tenants).
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
    let total_threads = TOKIO_WORKER_THREADS.get();
    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
    assert_ne!(permits, 0, "we will not be adding in permits later");
    assert!(permits < total_threads, "need threads for other work");
    Semaphore::new(permits)
});

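The permit count above is 3/4 of the Tokio worker threads, clamped to at least 1 and asserted to stay below the total. A quick standalone check of that arithmetic:

```rust
fn background_permits(total_threads: usize) -> usize {
    // Same formula as the semaphore initializers above.
    std::cmp::max(1, (total_threads * 3).checked_div(4).unwrap_or(0))
}

fn main() {
    for threads in [2, 4, 8, 16] {
        let permits = background_permits(threads);
        assert!(permits >= 1 && permits < threads);
        println!("{threads} worker threads -> {permits} background permits");
    }
}
```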
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
|
||||
/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by
|
||||
/// default, see `use_compaction_semaphore`.
|
||||
///
|
||||
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
|
||||
///
|
||||
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
|
||||
/// to avoid high read amp during heavy write workloads.
|
||||
///
|
||||
/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks.
|
||||
/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction.
|
||||
static CONCURRENT_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
|
||||
let total_threads = TOKIO_WORKER_THREADS.get();
|
||||
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(permits < total_threads, "need threads for other work");
|
||||
Semaphore::new(permits)
|
||||
});
|
||||
|
||||
/// Background jobs.
|
||||
///
|
||||
/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
|
||||
/// do any significant IO.
|
||||
#[derive(
|
||||
Debug,
|
||||
PartialEq,
|
||||
Eq,
|
||||
Clone,
|
||||
Copy,
|
||||
strum_macros::IntoStaticStr,
|
||||
strum_macros::Display,
|
||||
enum_map::Enum,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub(crate) enum BackgroundLoopKind {
|
||||
Compaction,
|
||||
Gc,
|
||||
Eviction,
|
||||
IngestHouseKeeping,
|
||||
TenantHouseKeeping,
|
||||
ConsumptionMetricsCollectMetrics,
|
||||
ConsumptionMetricsSyntheticSizeWorker,
|
||||
InitialLogicalSizeCalculation,
|
||||
@@ -55,36 +83,56 @@ pub(crate) enum BackgroundLoopKind {
|
||||
SecondaryDownload,
|
||||
}
|
||||
|
||||
impl BackgroundLoopKind {
|
||||
fn as_static_str(&self) -> &'static str {
|
||||
self.into()
|
||||
}
|
||||
pub struct BackgroundLoopSemaphorePermit<'a> {
|
||||
_permit: SemaphorePermit<'static>,
|
||||
_recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
|
||||
}
|
||||
|
||||
/// Cancellation safe.
|
||||
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
/// Acquires a semaphore permit, to limit concurrent background jobs.
|
||||
pub(crate) async fn acquire_concurrency_permit(
|
||||
loop_kind: BackgroundLoopKind,
|
||||
use_compaction_semaphore: bool,
|
||||
_ctx: &RequestContext,
|
||||
) -> tokio::sync::SemaphorePermit<'static> {
|
||||
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
|
||||
) -> BackgroundLoopSemaphorePermit<'static> {
|
||||
// TODO: use a lower threshold and remove the pacer once we resolve some blockage.
|
||||
const WARN_THRESHOLD: Duration = Duration::from_secs(600);
|
||||
static WARN_PACER: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
|
||||
let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
|
||||
|
||||
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
|
||||
pausable_failpoint!("initial-size-calculation-permit-pause");
|
||||
}
|
||||
|
||||
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
|
||||
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
|
||||
Ok(permit) => permit,
|
||||
Err(_closed) => unreachable!("we never close the semaphore"),
|
||||
let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore {
|
||||
CONCURRENT_COMPACTION_TASKS.acquire().await
|
||||
} else {
|
||||
assert!(!use_compaction_semaphore);
|
||||
CONCURRENT_BACKGROUND_TASKS.acquire().await
|
||||
}
|
||||
.expect("should never close");
|
||||
|
||||
let waited = recorder.acquired();
|
||||
if waited >= WARN_THRESHOLD {
|
||||
let waited = waited.as_secs_f64();
|
||||
WARN_PACER
|
||||
.lock()
|
||||
.unwrap()
|
||||
.call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
|
||||
}
|
||||
|
||||
BackgroundLoopSemaphorePermit {
|
||||
_permit: permit,
|
||||
_recorder: recorder,
|
||||
}
|
||||
}
|
||||
|
||||
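A standalone analogue of the permit acquisition above: measure how long a task waited for a global semaphore and warn when the wait exceeds a threshold. The names, the permit count, and the plain `eprintln!` are illustrative stand-ins (the real code records the wait through the metrics recorder and a rate-limited `warn!`); the sketch assumes `tokio` and `once_cell` as dependencies:

```rust
use std::time::{Duration, Instant};

use once_cell::sync::Lazy;
use tokio::sync::{Semaphore, SemaphorePermit};

static BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(6));

const WARN_THRESHOLD: Duration = Duration::from_secs(600);

async fn acquire_background_permit(task_name: &str) -> SemaphorePermit<'static> {
    let started = Instant::now();
    let permit = BACKGROUND_TASKS
        .acquire()
        .await
        .expect("semaphore is never closed");
    let waited = started.elapsed();
    if waited >= WARN_THRESHOLD {
        eprintln!("{task_name} waited {:.3}s for a permit", waited.as_secs_f64());
    }
    permit
}

#[tokio::main]
async fn main() {
    let _permit = acquire_background_permit("compaction").await;
    println!("doing background work while holding the permit");
    // The permit is released when `_permit` is dropped at the end of scope.
}
```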
/// Start per tenant background loops: compaction and gc.
|
||||
pub fn start_background_loops(
|
||||
tenant: &Arc<Tenant>,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
) {
|
||||
/// Start per tenant background loops: compaction, GC, and ingest housekeeping.
|
||||
pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
|
||||
let tenant_shard_id = tenant.tenant_shard_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
@@ -93,13 +141,15 @@ pub fn start_background_loops(
|
||||
&format!("compactor for tenant {tenant_shard_id}"),
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
let can_start = can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = Barrier::maybe_wait(can_start) => {}
|
||||
};
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
|
||||
compaction_loop(tenant, cancel)
|
||||
// If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
|
||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
@@ -108,6 +158,7 @@ pub fn start_background_loops(
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::GarbageCollector,
|
||||
@@ -116,13 +167,15 @@ pub fn start_background_loops(
|
||||
&format!("garbage collector for tenant {tenant_shard_id}"),
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
let can_start = can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = Barrier::maybe_wait(can_start) => {}
|
||||
};
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
|
||||
gc_loop(tenant, cancel)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
.await;
|
||||
@@ -133,21 +186,23 @@ pub fn start_background_loops(
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::IngestHousekeeping,
|
||||
TaskKind::TenantHousekeeping,
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
|
||||
&format!("housekeeping for tenant {tenant_shard_id}"),
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
let can_start = can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = Barrier::maybe_wait(can_start) => {}
|
||||
};
|
||||
ingest_housekeeping_loop(tenant, cancel)
|
||||
.instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
|
||||
tenant_housekeeping_loop(tenant, cancel)
|
||||
.instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
.await;
|
||||
Ok(())
|
||||
}
|
||||
@@ -155,372 +210,293 @@ pub fn start_background_loops(
|
||||
);
|
||||
}
|
||||
|
||||
///
|
||||
/// Compaction task's main loop
|
||||
///
|
||||
/// Compaction task's main loop.
|
||||
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
const BASE_BACKOFF_SECS: f64 = 1.0;
|
||||
const MAX_BACKOFF_SECS: f64 = 300.0;
|
||||
// How many consecutive errors we have seen
|
||||
let mut error_run_count = 0;
|
||||
const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut period = tenant.get_compaction_period();
|
||||
let mut error_run = 0; // consecutive errors
|
||||
|
||||
// Stagger the compaction loop across tenants.
|
||||
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
|
||||
return;
|
||||
}
|
||||
if sleep_random(period, &cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
loop {
|
||||
// Recheck that we're still active.
|
||||
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Refresh the period. If compaction is disabled, check again in a bit.
|
||||
period = tenant.get_compaction_period();
|
||||
if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic compaction is disabled");
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(()) => (),
|
||||
},
|
||||
_ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {},
|
||||
_ = cancel.cancelled() => return,
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let period = tenant.get_compaction_period();
|
||||
// Wait for the next compaction run.
|
||||
let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff), if error_run > 0 => {},
|
||||
_ = tokio::time::sleep(period), if error_run == 0 => {},
|
||||
_ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {},
|
||||
_ = cancel.cancelled() => return,
|
||||
}
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
// Run compaction.
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::Compaction,
|
||||
};
|
||||
let IterationResult { output, elapsed } = iteration
|
||||
.run(tenant.compaction_iteration(&cancel, &ctx))
|
||||
.await;
|
||||
|
||||
match output {
|
||||
Ok(outcome) => {
|
||||
error_run = 0;
|
||||
// If there's more compaction work pending, reschedule immediately. This isn't
|
||||
// necessarily L0 compaction, but that's fine for now.
|
||||
//
|
||||
// TODO: differentiate between L0 compaction and other compaction. The former needs
|
||||
// to be responsive, the latter doesn't.
|
||||
if outcome == CompactionOutcome::Pending {
|
||||
tenant.l0_compaction_trigger.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
let sleep_duration;
|
||||
if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10)
|
||||
} else {
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::Compaction,
|
||||
};
|
||||
|
||||
// Run compaction
|
||||
let IterationResult { output, elapsed } = iteration
|
||||
.run(tenant.compaction_iteration(&cancel, &ctx))
|
||||
.await;
|
||||
match output {
|
||||
Ok(outcome) => {
|
||||
error_run_count = 0;
|
||||
// schedule the next compaction immediately in case there is a pending compaction task
|
||||
sleep_duration = if let CompactionOutcome::Pending = outcome {
|
||||
Duration::ZERO
|
||||
} else {
|
||||
period
|
||||
};
|
||||
}
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
sleep_duration = wait_duration;
|
||||
}
|
||||
}
|
||||
|
||||
// the duration is recorded by performance tests by enabling debug in this function
|
||||
tracing::debug!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"compaction iteration complete"
|
||||
);
|
||||
};
|
||||
|
||||
// Perhaps we did no work and the walredo process has been idle for some time:
|
||||
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
|
||||
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
|
||||
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
|
||||
if let Some(walredo_mgr) = &tenant.walredo_mgr {
|
||||
walredo_mgr.maybe_quiesce(period * 10);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
Err(err) => {
|
||||
error_run += 1;
|
||||
let backoff =
|
||||
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
|
||||
log_compaction_error(&err, error_run, backoff, cancel.is_cancelled());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// NB: this log entry is recorded by performance tests.
|
||||
debug!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"compaction iteration complete"
|
||||
);
|
||||
}
|
||||
.await;
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
|
||||
|
||||
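`exponential_backoff_duration` itself is not shown in this diff; the loop above only needs it to grow the retry delay with the consecutive error count and cap it at `MAX_BACKOFF_SECS`. A self-contained sketch of that behaviour follows; the exact formula in `utils::backoff` may differ:

```rust
use std::time::Duration;

/// Grow the delay exponentially with the number of consecutive errors,
/// capped at `max_secs`. With n = 0 there is no delay at all.
fn exponential_backoff_duration(n: u32, base_secs: f64, max_secs: f64) -> Duration {
    if n == 0 {
        return Duration::ZERO;
    }
    let secs = (base_secs * 2f64.powi(n as i32 - 1)).min(max_secs);
    Duration::from_secs_f64(secs)
}

fn main() {
    for errors in 0..=10 {
        let delay = exponential_backoff_duration(errors, 1.0, 300.0);
        println!("after {errors} consecutive errors: sleep {delay:?}");
    }
    // 1s, 2s, 4s, ... until the 300s cap is reached.
}
```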
fn log_compaction_error(
|
||||
e: &CompactionError,
|
||||
error_run_count: u32,
|
||||
sleep_duration: &std::time::Duration,
|
||||
err: &CompactionError,
|
||||
error_count: u32,
|
||||
sleep_duration: Duration,
|
||||
task_cancelled: bool,
|
||||
) {
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use CompactionError::*;
|
||||
|
||||
enum LooksLike {
|
||||
Info,
|
||||
Error,
|
||||
}
|
||||
let level = match err {
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
|
||||
let decision = match e {
|
||||
ShuttingDown => None,
|
||||
Offload(_) => Some(LooksLike::Error),
|
||||
_ if task_cancelled => Some(LooksLike::Info),
|
||||
Other(e) => {
|
||||
let root_cause = e.root_cause();
|
||||
|
||||
let is_stopping = {
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
upload_queue || timeline
|
||||
};
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let is_stopping = upload_queue || timeline;
|
||||
|
||||
if is_stopping {
|
||||
Some(LooksLike::Info)
|
||||
Level::INFO
|
||||
} else {
|
||||
Some(LooksLike::Error)
|
||||
Level::ERROR
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match decision {
|
||||
Some(LooksLike::Info) => info!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
|
||||
),
|
||||
Some(LooksLike::Error) => error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
|
||||
),
|
||||
None => {}
|
||||
match level {
|
||||
Level::ERROR => {
|
||||
error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
|
||||
}
|
||||
Level::INFO => {
|
||||
info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
|
||||
}
|
||||
level => unimplemented!("unexpected level {level:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
/// GC task's main loop.
|
||||
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
const MAX_BACKOFF_SECS: f64 = 300.0;
|
||||
// How many consecutive errors we have seen
|
||||
let mut error_run_count = 0;
|
||||
let mut error_run = 0; // consecutive errors
|
||||
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let ctx =
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
|
||||
let mut first = true;
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(()) => (),
|
||||
},
|
||||
}
|
||||
loop {
|
||||
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
|
||||
return;
|
||||
}
|
||||
|
||||
let period = tenant.get_gc_period();
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
if first {
|
||||
first = false;
|
||||
|
||||
let delays = async {
|
||||
random_init_delay(period, &cancel).await?;
|
||||
Ok::<_, Cancelled>(())
|
||||
};
|
||||
|
||||
if delays.await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration;
|
||||
if period == Duration::ZERO || gc_horizon == 0 {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::Gc,
|
||||
};
|
||||
// Run gc
|
||||
let IterationResult { output, elapsed: _ } =
|
||||
iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx))
|
||||
.await;
|
||||
match output {
|
||||
Ok(_) => {
|
||||
error_run_count = 0;
|
||||
sleep_duration = period;
|
||||
}
|
||||
Err(crate::tenant::GcError::TenantCancelled) => {
|
||||
return;
|
||||
}
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
|
||||
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
|
||||
// Timeline was cancelled during gc. We might either be in an event
|
||||
// that affects the entire tenant (tenant deletion, pageserver shutdown),
|
||||
// or in one that affects the timeline only (timeline deletion).
|
||||
// Therefore, don't exit the loop.
|
||||
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
|
||||
} else {
|
||||
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
|
||||
}
|
||||
|
||||
sleep_duration = wait_duration;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
if first {
|
||||
first = false;
|
||||
if sleep_random(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
.await;
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
|
||||
|
||||
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let mut last_throttle_flag_reset_at = Instant::now();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(()) => (),
|
||||
},
|
||||
}
|
||||
|
||||
// We run ingest housekeeping with the same frequency as compaction: it is not worth
|
||||
// having a distinct setting. But we don't run it in the same task, because compaction
|
||||
// blocks on acquiring the background job semaphore.
|
||||
let period = tenant.get_compaction_period();
|
||||
|
||||
// If compaction period is set to zero (to disable it), then we will use a reasonable default
|
||||
let period = if period == Duration::ZERO {
|
||||
humantime::Duration::from_str(
|
||||
pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
|
||||
)
|
||||
.unwrap()
|
||||
.into()
|
||||
} else {
|
||||
period
|
||||
};
|
||||
|
||||
// Jitter the period by +/- 5%
|
||||
let period =
|
||||
rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);
|
||||
|
||||
// Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
|
||||
// a tenant, since it won't have started writing any ephemeral files yet.
|
||||
if tokio::time::timeout(period, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration;
|
||||
if period == Duration::ZERO || gc_horizon == 0 {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::IngestHouseKeeping,
|
||||
kind: BackgroundLoopKind::Gc,
|
||||
};
|
||||
iteration.run(tenant.ingest_housekeeping()).await;
|
||||
|
||||
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
|
||||
// Or just spawn another background loop for this throttle, it's not like it's super costly.
|
||||
info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
let now = Instant::now();
|
||||
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
|
||||
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
|
||||
if count_throttled == 0 {
|
||||
// Run gc
|
||||
let IterationResult { output, elapsed: _ } = iteration
|
||||
.run(tenant.gc_iteration(
|
||||
None,
|
||||
gc_horizon,
|
||||
tenant.get_pitr_interval(),
|
||||
&cancel,
|
||||
&ctx,
|
||||
))
|
||||
.await;
|
||||
match output {
|
||||
Ok(_) => {
|
||||
error_run = 0;
|
||||
sleep_duration = period;
|
||||
}
|
||||
Err(crate::tenant::GcError::TenantCancelled) => {
|
||||
return;
|
||||
}
|
||||
let allowed_rps = tenant.pagestream_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
info!(
|
||||
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
|
||||
count_accounted = count_accounted_finish, // don't break existing log scraping
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
count_accounted_start, // log after pre-existing fields to not break existing log scraping
|
||||
allowed_rps=%format_args!("{allowed_rps:.0}"),
|
||||
"shard was throttled in the last n_seconds"
|
||||
);
|
||||
});
|
||||
}
|
||||
}
|
||||
.await;
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
|
||||
Err(e) => {
|
||||
error_run += 1;
|
||||
let wait_duration =
|
||||
exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS);
|
||||
|
||||
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
|
||||
// if the tenant has a proper status already, no need to wait for anything
|
||||
if tenant.current_state() == TenantState::Active {
|
||||
ControlFlow::Continue(())
|
||||
} else {
|
||||
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
|
||||
loop {
|
||||
match tenant_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = &*tenant_state_updates.borrow();
|
||||
match new_state {
|
||||
TenantState::Active => {
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
state => {
|
||||
debug!("Not running the task loop, tenant is not active: {state:?}");
|
||||
continue;
|
||||
}
|
||||
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
|
||||
// Timeline was cancelled during gc. We might either be in an event
|
||||
// that affects the entire tenant (tenant deletion, pageserver shutdown),
|
||||
// or in one that affects the timeline only (timeline deletion).
|
||||
// Therefore, don't exit the loop.
|
||||
info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
|
||||
} else {
|
||||
error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => {
|
||||
return ControlFlow::Break(());
|
||||
|
||||
sleep_duration = wait_duration;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tenant housekeeping's main loop.
|
||||
async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let mut last_throttle_flag_reset_at = Instant::now();
|
||||
loop {
|
||||
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Use the same period as compaction; it's not worth a separate setting. But if it's set to
|
||||
// zero (to disable compaction), then use a reasonable default. Jitter it by 5%.
|
||||
let period = match tenant.get_compaction_period() {
|
||||
Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(),
|
||||
period => period,
|
||||
};
|
||||
|
||||
let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Do tenant housekeeping.
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::TenantHouseKeeping,
|
||||
};
|
||||
iteration.run(tenant.housekeeping()).await;
|
||||
|
||||
// Log any getpage throttling.
|
||||
info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
let now = Instant::now();
|
||||
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
|
||||
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
|
||||
if count_throttled == 0 {
|
||||
return;
|
||||
}
|
||||
let allowed_rps = tenant.pagestream_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
info!(
|
||||
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
|
||||
count_accounted = count_accounted_finish, // don't break existing log scraping
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
count_accounted_start, // log after pre-existing fields to not break existing log scraping
|
||||
allowed_rps=%format_args!("{allowed_rps:.0}"),
|
||||
"shard was throttled in the last n_seconds"
|
||||
);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
|
||||
async fn wait_for_active_tenant(
|
||||
tenant: &Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
) -> ControlFlow<()> {
|
||||
if tenant.current_state() == TenantState::Active {
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
|
||||
let mut update_rx = tenant.subscribe_for_state_updates();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return ControlFlow::Break(()),
|
||||
result = update_rx.changed() => if result.is_err() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
match &*update_rx.borrow() {
|
||||
TenantState::Active => {
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
state => debug!("Not running the task loop, tenant is not active: {state:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -529,26 +505,41 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
|
||||
#[error("cancelled")]
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Provide a random delay for background task initialization.
|
||||
/// Sleeps for a random interval up to the given max value.
|
||||
///
|
||||
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
|
||||
/// different periods for more stable load.
|
||||
pub(crate) async fn random_init_delay(
|
||||
period: Duration,
|
||||
pub(crate) async fn sleep_random(
|
||||
max: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), Cancelled> {
|
||||
if period == Duration::ZERO {
|
||||
return Ok(());
|
||||
}
|
||||
) -> Result<Duration, Cancelled> {
|
||||
sleep_random_range(Duration::ZERO..=max, cancel).await
|
||||
}
|
||||
|
||||
let d = {
|
||||
let mut rng = rand::thread_rng();
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
match tokio::time::timeout(d, cancel.cancelled()).await {
|
||||
Ok(_) => Err(Cancelled),
|
||||
Err(_) => Ok(()),
|
||||
/// Sleeps for a random interval in the given range. Returns the duration.
|
||||
pub(crate) async fn sleep_random_range(
|
||||
interval: RangeInclusive<Duration>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Duration, Cancelled> {
|
||||
let delay = rand::thread_rng().gen_range(interval);
|
||||
if delay == Duration::ZERO {
|
||||
return Ok(delay);
|
||||
}
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
_ = tokio::time::sleep(delay) => Ok(delay),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sleeps for an interval with a random jitter.
pub(crate) async fn sleep_jitter(
    duration: Duration,
    jitter: Duration,
    cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
    let from = duration.saturating_sub(jitter);
    let to = duration.saturating_add(jitter);
    sleep_random_range(from..=to, cancel).await
}

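`sleep_jitter` above turns a period plus jitter into a closed range for `sleep_random_range`. The range arithmetic alone, without the async sleep, can be checked in isolation:

```rust
use std::ops::RangeInclusive;
use std::time::Duration;

/// Same range computation as `sleep_jitter`: period +/- jitter, saturating at zero.
fn jitter_range(period: Duration, jitter: Duration) -> RangeInclusive<Duration> {
    period.saturating_sub(jitter)..=period.saturating_add(jitter)
}

fn main() {
    let period = Duration::from_secs(20);
    let jitter = period * 5 / 100; // the 5% used by the housekeeping loop
    let range = jitter_range(period, jitter);
    assert_eq!(*range.start(), Duration::from_secs(19));
    assert_eq!(*range.end(), Duration::from_secs(21));

    // Saturation keeps the lower bound at zero for tiny periods.
    let tiny = jitter_range(Duration::from_millis(1), Duration::from_secs(1));
    assert_eq!(*tiny.start(), Duration::ZERO);
    println!("jitter range: {:?}..={:?}", range.start(), range.end());
}
```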
struct Iteration {
|
||||
@@ -564,42 +555,25 @@ struct IterationResult<O> {
|
||||
|
||||
impl Iteration {
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
let Self {
|
||||
started_at,
|
||||
period,
|
||||
kind,
|
||||
} = self;
|
||||
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
pub(crate) async fn run<F: Future<Output = O>, O>(self, fut: F) -> IterationResult<O> {
|
||||
let mut fut = pin!(fut);
|
||||
|
||||
// Wrap `fut` into a future that logs a message every `period` so that we get a
|
||||
// very obvious breadcrumb in the logs _while_ a slow iteration is happening.
|
||||
let liveness_logger = async move {
|
||||
loop {
|
||||
match tokio::time::timeout(period, &mut fut).await {
|
||||
Ok(x) => return x,
|
||||
Err(_) => {
|
||||
// info level as per the same rationale why warn_when_period_overrun is info
|
||||
// => https://github.com/neondatabase/neon/pull/5724
|
||||
info!("still running");
|
||||
}
|
||||
}
|
||||
let output = loop {
|
||||
match tokio::time::timeout(self.period, &mut fut).await {
|
||||
Ok(r) => break r,
|
||||
Err(_) => info!("still running"),
|
||||
}
|
||||
};
|
||||
|
||||
let output = liveness_logger.await;
|
||||
|
||||
let elapsed = started_at.elapsed();
|
||||
warn_when_period_overrun(elapsed, period, kind);
|
||||
let elapsed = self.started_at.elapsed();
|
||||
warn_when_period_overrun(elapsed, self.period, self.kind);
|
||||
|
||||
IterationResult { output, elapsed }
|
||||
}
|
||||
}
|
||||
/// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric.
|
||||
|
||||
// NB: the `task` and `period` are used for metrics labels.
|
||||
pub(crate) fn warn_when_period_overrun(
|
||||
elapsed: Duration,
|
||||
period: Duration,
|
||||
@@ -617,7 +591,7 @@ pub(crate) fn warn_when_period_overrun(
|
||||
"task iteration took longer than the configured period"
|
||||
);
|
||||
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
|
||||
.with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
|
||||
.with_label_values(&[task.into(), &format!("{}", period.as_secs())])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,13 +45,12 @@ use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::{
|
||||
runtime::Handle,
|
||||
sync::{oneshot, watch},
|
||||
};
|
||||
use tokio::sync::{oneshot, watch, Notify};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::critical;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{
|
||||
fs_ext,
|
||||
@@ -192,7 +191,12 @@ pub enum ImageLayerCreationMode {
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub enum LastImageLayerCreationStatus {
|
||||
Incomplete, // TODO: record the last key being processed
|
||||
Incomplete {
|
||||
/// The last key of the partition (exclusive) that was processed in the last
|
||||
/// image layer creation attempt. We will continue from this key in the next
|
||||
/// attempt.
|
||||
last_key: Key,
|
||||
},
|
||||
Complete,
|
||||
#[default]
|
||||
Initial,
|
||||
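The enum now carries the last processed key so the next image-layer pass can resume instead of starting over. A simplified sketch of how a caller might pick the starting key from that status, using `u64` keys in place of the real `Key` type:

```rust
#[derive(Clone, Debug, Default)]
enum LastImageLayerCreationStatus {
    /// Last attempt stopped early; continue from (exclusive) `last_key`.
    Incomplete { last_key: u64 },
    /// Last attempt covered the whole keyspace.
    Complete,
    /// No attempt has run yet.
    #[default]
    Initial,
}

const KEY_MIN: u64 = 0;

/// Decide where the next image-layer creation pass should start.
fn resume_key(status: &LastImageLayerCreationStatus) -> u64 {
    match status {
        LastImageLayerCreationStatus::Incomplete { last_key } => *last_key,
        LastImageLayerCreationStatus::Complete | LastImageLayerCreationStatus::Initial => KEY_MIN,
    }
}

fn main() {
    assert_eq!(resume_key(&LastImageLayerCreationStatus::Initial), 0);
    assert_eq!(
        resume_key(&LastImageLayerCreationStatus::Incomplete { last_key: 42 }),
        42
    );
    println!("resume logic behaves as expected");
}
```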
@@ -221,6 +225,7 @@ pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -420,6 +425,9 @@ pub struct Timeline {
|
||||
/// If true, the last compaction failed.
|
||||
compaction_failed: AtomicBool,
|
||||
|
||||
/// Notifies the tenant compaction loop that there is pending L0 compaction work.
|
||||
l0_compaction_trigger: Arc<Notify>,
|
||||
|
||||
/// Make sure we only have one running gc at a time.
|
||||
///
|
||||
/// Must only be taken in two places:
|
||||
@@ -526,6 +534,9 @@ impl GcInfo {
|
||||
pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool {
|
||||
self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes)
|
||||
}
|
||||
pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool {
|
||||
self.leases.contains_key(&lsn)
|
||||
}
|
||||
}
|
||||
|
||||
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
|
||||
@@ -617,6 +628,71 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
|
||||
}
|
||||
}
|
||||
|
||||
/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes
|
||||
/// only and not used by the "real read path".
|
||||
pub enum ReadPathLayerId {
|
||||
PersistentLayer(PersistentLayerKey),
|
||||
InMemoryLayer(Range<Lsn>),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ReadPathLayerId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key),
|
||||
ReadPathLayerId::InMemoryLayer(range) => {
|
||||
write!(f, "in-mem {}..{}", range.start, range.end)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub struct ReadPath {
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
path: Vec<(ReadPathLayerId, KeySpace, Range<Lsn>)>,
|
||||
}
|
||||
|
||||
impl ReadPath {
|
||||
pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self {
|
||||
Self {
|
||||
keyspace,
|
||||
lsn,
|
||||
path: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_layer_visit(
|
||||
&mut self,
|
||||
layer_to_read: &ReadableLayer,
|
||||
keyspace_to_read: &KeySpace,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) {
|
||||
let id = match layer_to_read {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
ReadPathLayerId::PersistentLayer(layer.layer_desc().key())
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
ReadPathLayerId::InMemoryLayer(layer.get_lsn_range())
|
||||
}
|
||||
};
|
||||
self.path
|
||||
.push((id, keyspace_to_read.clone(), lsn_range.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ReadPath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?;
        for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() {
            writeln!(
                f,
                "{}: {} {}..{} {}",
                idx, layer_id, lsn_range.start, lsn_range.end, keyspace
            )?;
        }
        Ok(())
    }
}

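The `ReadPath` printed above is purely diagnostic: it accumulates one entry per layer visit and renders them in visit order. A trimmed-down standalone version showing the same record-then-format flow, with string layer ids and `u64` LSNs standing in for the real types:

```rust
use std::fmt;
use std::ops::Range;

/// Minimal stand-in for the diagnostic read path: which layers a read visited,
/// for what key range description, over which LSN range.
struct ReadPath {
    lsn: u64,
    path: Vec<(String, String, Range<u64>)>,
}

impl ReadPath {
    fn new(lsn: u64) -> Self {
        Self { lsn, path: Vec::new() }
    }

    fn record_layer_visit(&mut self, layer: &str, keyspace: &str, lsn_range: Range<u64>) {
        self.path.push((layer.to_string(), keyspace.to_string(), lsn_range));
    }
}

impl fmt::Display for ReadPath {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "Read path at lsn {}:", self.lsn)?;
        for (idx, (layer, keyspace, lsn_range)) in self.path.iter().enumerate() {
            writeln!(f, "{idx}: {layer} {}..{} {keyspace}", lsn_range.start, lsn_range.end)?;
        }
        Ok(())
    }
}

fn main() {
    let mut read_path = ReadPath::new(0x60);
    read_path.record_layer_visit("in-mem", "0000..0010", 0x50..0x60);
    read_path.record_layer_visit("delta-layer-0000-0010", "0000..0010", 0x40..0x50);
    print!("{read_path}");
}
```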
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
@@ -624,6 +700,8 @@ pub struct MissingKeyError {
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
/// Debug information about the read path if there's an error
|
||||
read_path: Option<ReadPath>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
@@ -640,10 +718,15 @@ impl std::fmt::Display for MissingKeyError {
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if let Some(ref read_path) = self.read_path {
|
||||
write!(f, "\n{}", read_path)?;
|
||||
}
|
||||
|
||||
if let Some(ref backtrace) = self.backtrace {
|
||||
write!(f, "\n{}", backtrace)?;
|
||||
}
|
||||
@@ -1060,6 +1143,7 @@ impl Timeline {
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
})),
|
||||
}
|
||||
}
|
||||
@@ -1186,6 +1270,13 @@ impl Timeline {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let read_path = if self.conf.enable_read_path_debugging {
|
||||
Some(ReadPath::new(keyspace.clone(), lsn))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let traversal_res: Result<(), _> = self
|
||||
.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
.await;
|
||||
@@ -1709,8 +1800,9 @@ impl Timeline {
|
||||
let prepare = async move {
|
||||
let guard = self.compaction_lock.lock().await;
|
||||
|
||||
let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
let permit = super::tasks::acquire_concurrency_permit(
|
||||
BackgroundLoopKind::Compaction,
|
||||
self.conf.use_compaction_semaphore,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
@@ -2574,6 +2666,7 @@ impl Timeline {
|
||||
|
||||
compaction_lock: tokio::sync::Mutex::default(),
|
||||
compaction_failed: AtomicBool::default(),
|
||||
l0_compaction_trigger: resources.l0_compaction_trigger,
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
standby_horizon: AtomicLsn::new(0),
|
||||
@@ -2623,7 +2716,7 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
FlushLoopState::Exited => {
|
||||
warn!(
|
||||
info!(
|
||||
"ignoring attempt to restart exited flush_loop {}/{}",
|
||||
self.tenant_shard_id, self.timeline_id
|
||||
);
|
||||
@@ -3047,8 +3140,9 @@ impl Timeline {
|
||||
let self_ref = &self;
|
||||
let skip_concurrency_limiter = &skip_concurrency_limiter;
|
||||
async move {
|
||||
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
let wait_for_permit = super::tasks::acquire_concurrency_permit(
|
||||
BackgroundLoopKind::InitialLogicalSizeCalculation,
|
||||
false,
|
||||
background_ctx,
|
||||
);
|
||||
|
||||
@@ -3493,6 +3587,7 @@ impl Timeline {
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
backtrace: None,
|
||||
read_path: std::mem::take(&mut reconstruct_state.read_path),
|
||||
}));
|
||||
}
|
||||
|
||||
@@ -3611,6 +3706,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
@@ -3911,6 +4009,12 @@ impl Timeline {
|
||||
}
|
||||
let flush_duration = flush_timer.stop_and_record();
|
||||
|
||||
// Notify the tenant compaction loop if L0 compaction is needed.
|
||||
let l0_count = *watch_l0.borrow();
|
||||
if l0_count >= self.get_compaction_threshold() {
|
||||
self.l0_compaction_trigger.notify_one();
|
||||
}
|
||||
|
||||
// Delay the next flush to backpressure if compaction can't keep up. We delay by the
|
||||
// flush duration such that the flush takes 2x as long. This is propagated up to WAL
|
||||
// ingestion by having ephemeral layer rolls wait for flushes.
|
||||
@@ -4346,7 +4450,7 @@ impl Timeline {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
// Is it time to create a new image layer for the given partition? True if we want to generate.
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
@@ -4658,6 +4762,11 @@ impl Timeline {
|
||||
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
if partitioning.parts.is_empty() {
|
||||
warn!("no partitions to create image layers for");
|
||||
return Ok((vec![], LastImageLayerCreationStatus::Complete));
|
||||
}
|
||||
|
||||
// We need to avoid holes between generated image layers.
|
||||
// Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one
|
||||
// image layer with hole between them. In this case such layer can not be utilized by GC.
|
||||
@@ -4669,28 +4778,65 @@ impl Timeline {
|
||||
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
|
||||
let mut start = Key::MIN;
|
||||
|
||||
let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status {
|
||||
info!(
|
||||
"resuming image layer creation: last_status={:?}",
|
||||
last_status
|
||||
);
|
||||
true
|
||||
} else {
|
||||
self.should_check_if_image_layers_required(lsn)
|
||||
};
|
||||
let check_for_image_layers =
|
||||
if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
|
||||
info!(
|
||||
"resuming image layer creation: last_status=incomplete, continue from {}",
|
||||
last_key
|
||||
);
|
||||
true
|
||||
} else {
|
||||
self.should_check_if_image_layers_required(lsn)
|
||||
};
|
||||
|
||||
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
|
||||
|
||||
let mut all_generated = true;
|
||||
|
||||
let mut partition_processed = 0;
|
||||
let total_partitions = partitioning.parts.len();
|
||||
let mut total_partitions = partitioning.parts.len();
|
||||
let mut last_partition_processed = None;
|
||||
let mut partition_parts = partitioning.parts.clone();
|
||||
|
||||
for partition in partitioning.parts.iter() {
|
||||
        if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
            // We need to skip the partitions that have already been processed.
            let mut found = false;
            for (i, partition) in partition_parts.iter().enumerate() {
                if last_key <= partition.end().unwrap() {
                    // ```plain
                    // |------|--------|----------|------|
                    //            ^last_key
                    //                 ^start from this partition
                    // ```
                    // Why `i+1` instead of `i`?
                    // It is possible that the user did some writes after the previous image layer creation attempt so that
                    // a relation grows in size, and the last_key is now in the middle of the partition. In this case, we
                    // still want to skip this partition, so that we can make progress and avoid generating image layers over
                    // the same partition. Doing a mod to ensure we don't end up with an empty vec.
                    if i + 1 >= total_partitions {
                        // In general, this case should not happen -- if last_key is on the last partition, the previous
                        // iteration of image layer creation should return a complete status.
                        break; // with found=false
                    }
                    partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
                    total_partitions = partition_parts.len();
                    // Update the start key to the partition start.
                    start = partition_parts[0].start().unwrap();
                    found = true;
                    break;
                }
            }
            if !found {
                // Last key is within the last partition, or larger than all partitions.
                return Ok((vec![], LastImageLayerCreationStatus::Complete));
            }
        }
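The comments above carry the key invariant of resuming image layer creation: skip every partition the previous, preempted attempt already covered, and always skip one extra (`i + 1`) so that a relation which grew in the meantime cannot keep the run stuck on the same partition. A minimal sketch of that index calculation, using `u64` ranges as hypothetical stand-ins for the real keyspace partitions:

```rust
use std::ops::Range;

/// Return the partitions that still need image layers after an attempt that
/// stopped at `last_key`, or None when the previous attempt already covered
/// everything (the "found = false" case above).
fn resume_partitions(mut parts: Vec<Range<u64>>, last_key: u64) -> Option<Vec<Range<u64>>> {
    for i in 0..parts.len() {
        if last_key <= parts[i].end {
            // Skip partition i as well (hence i + 1): last_key may sit in the
            // middle of it if the relation grew since the previous attempt.
            if i + 1 >= parts.len() {
                return None;
            }
            return Some(parts.split_off(i + 1));
        }
    }
    None
}

fn main() {
    let parts = vec![0..100, 100..200, 200..300];
    assert_eq!(resume_partitions(parts.clone(), 150), Some(vec![200..300]));
    // last_key inside the final partition: treated as already complete.
    assert_eq!(resume_partitions(parts, 250), None);
}
```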
|
||||
|
||||
for partition in partition_parts.iter() {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CreateImageLayersError::Cancelled);
|
||||
}
|
||||
|
||||
partition_processed += 1;
|
||||
let img_range = start..partition.ranges.last().unwrap().end;
|
||||
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
|
||||
if compact_metadata {
|
||||
@@ -4725,6 +4871,8 @@ impl Timeline {
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
||||
is_delta: false,
|
||||
}) {
|
||||
// TODO: this can be processed with the BatchLayerWriter::finish_with_discard
|
||||
// in the future.
|
||||
tracing::info!(
|
||||
"Skipping image layer at {lsn} {}..{}, already exists",
|
||||
img_range.start,
|
||||
@@ -4805,8 +4953,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
partition_processed += 1;
|
||||
|
||||
if let ImageLayerCreationMode::Try = mode {
|
||||
// We have at least made some progress
|
||||
if batch_image_writer.pending_layer_num() >= 1 {
|
||||
@@ -4822,8 +4968,10 @@ impl Timeline {
|
||||
* self.get_compaction_threshold();
|
||||
if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
|
||||
tracing::info!(
|
||||
"preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}",
|
||||
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
|
||||
partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
|
||||
);
|
||||
last_partition_processed = Some(partition.clone());
|
||||
all_generated = false;
|
||||
break;
|
||||
}
|
||||
@@ -4868,7 +5016,14 @@ impl Timeline {
|
||||
if all_generated {
|
||||
LastImageLayerCreationStatus::Complete
|
||||
} else {
|
||||
LastImageLayerCreationStatus::Incomplete
|
||||
LastImageLayerCreationStatus::Incomplete {
|
||||
last_key: if let Some(last_partition_processed) = last_partition_processed {
|
||||
last_partition_processed.end().unwrap_or(Key::MIN)
|
||||
} else {
|
||||
// This branch should be unreachable, but in case it happens, we can just return the start key.
|
||||
Key::MIN
|
||||
},
|
||||
}
|
||||
},
|
||||
))
|
||||
}
|
||||
@@ -5748,10 +5903,11 @@ impl Timeline {
         let img = match res {
             Ok(img) => img,
             Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
-            Err(walredo::Error::Other(e)) => {
+            Err(walredo::Error::Other(err)) => {
+                critical!("walredo failure during page reconstruction: {err:?}");
                 return Err(PageReconstructError::WalRedo(
-                    e.context("reconstruct a page image"),
-                ))
+                    err.context("reconstruct a page image"),
+                ));
             }
         };
         Ok(img)

@@ -10,8 +10,8 @@ use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
LastImageLayerCreationStatus, RecordedDuration, Timeline,
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
|
||||
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
@@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::critical;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
@@ -33,6 +34,7 @@ use crate::page_cache;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::gc_block::GcBlock;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::batch_split_writer::{
|
||||
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
|
||||
@@ -438,6 +440,11 @@ impl KeyHistoryRetention {
|
||||
if dry_run {
|
||||
return true;
|
||||
}
|
||||
if LayerMap::is_l0(&key.key_range, key.is_delta) {
|
||||
// gc-compaction should not produce L0 deltas, otherwise it will break the layer order.
|
||||
// We should ignore such layers.
|
||||
return true;
|
||||
}
|
||||
let layer_generation;
|
||||
{
|
||||
let guard = tline.layers.read().await;
|
||||
@@ -680,6 +687,20 @@ impl Timeline {
|
||||
|
||||
// Define partitioning schema if needed
|
||||
|
||||
let l0_l1_boundary_lsn = {
|
||||
// We do the repartition on the L0-L1 boundary. All data below the boundary
|
||||
// are compacted by L0 with low read amplification, thus making the `repartition`
|
||||
// function run fast.
|
||||
let guard = self.layers.read().await;
|
||||
let l0_min_lsn = guard
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.iter()
|
||||
.map(|l| l.get_lsn_range().start)
|
||||
.min()
|
||||
.unwrap_or(self.get_disk_consistent_lsn());
|
||||
l0_min_lsn.max(self.get_ancestor_lsn())
|
||||
};
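The block above picks the repartition LSN: the low end of the current L0 stack (or the disk-consistent LSN when there are no L0 deltas), clamped so it never goes below the ancestor/branch LSN. A small sketch of that choice with `u64` in place of `Lsn` (a hypothetical helper, not the timeline API):

```rust
/// `l0_starts` are the start LSNs of the current level-0 delta layers; if the
/// slice is empty, everything up to `disk_consistent_lsn` is assumed to be
/// compacted into L1 already.
fn l0_l1_boundary(l0_starts: &[u64], disk_consistent_lsn: u64, ancestor_lsn: u64) -> u64 {
    let l0_min = l0_starts.iter().copied().min().unwrap_or(disk_consistent_lsn);
    // Never repartition below the branch point.
    l0_min.max(ancestor_lsn)
}

fn main() {
    assert_eq!(l0_l1_boundary(&[120, 100, 140], 200, 50), 100);
    assert_eq!(l0_l1_boundary(&[], 200, 50), 200);
    assert_eq!(l0_l1_boundary(&[30], 200, 50), 50);
}
```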
|
||||
// 1. L0 Compact
|
||||
let l0_compaction_outcome = {
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
@@ -702,72 +723,87 @@ impl Timeline {
|
||||
return Ok(CompactionOutcome::Pending);
|
||||
}
|
||||
|
||||
// 2. Repartition and create image layers if necessary
|
||||
let partition_count = match self
|
||||
.repartition(
|
||||
self.get_last_record_lsn(), // TODO: use L0-L1 boundary
|
||||
self.get_compaction_target_size(),
|
||||
options.flags,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
if l0_l1_boundary_lsn < self.partitioning.read().1 {
|
||||
// We never go backwards when repartition and create image layers.
|
||||
info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN.");
|
||||
} else {
|
||||
// 2. Repartition and create image layers if necessary
|
||||
match self
|
||||
.repartition(
|
||||
l0_l1_boundary_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
options.flags,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified "enough".
|
||||
let (image_layers, outcome) = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
if options
|
||||
.flags
|
||||
.contains(CompactFlags::ForceImageLayerCreation)
|
||||
{
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
self.last_image_layer_creation_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.clone(),
|
||||
)
|
||||
.await?;
|
||||
// 3. Create new image layers for partitions that have been modified "enough".
|
||||
let (image_layers, outcome) = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
if options
|
||||
.flags
|
||||
.contains(CompactFlags::ForceImageLayerCreation)
|
||||
{
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
self.last_image_layer_creation_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.clone(),
|
||||
)
|
||||
.await
|
||||
.inspect_err(|err| {
|
||||
if let CreateImageLayersError::GetVectoredError(
|
||||
GetVectoredError::MissingKey(_),
|
||||
) = err
|
||||
{
|
||||
critical!("missing key during compaction: {err:?}");
|
||||
}
|
||||
})?;
|
||||
|
||||
self.last_image_layer_creation_status
|
||||
.store(Arc::new(outcome.clone()));
|
||||
self.last_image_layer_creation_status
|
||||
.store(Arc::new(outcome.clone()));
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
if let LastImageLayerCreationStatus::Incomplete = outcome {
|
||||
// Yield and do not do any other kind of compaction.
|
||||
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
|
||||
return Ok(CompactionOutcome::Pending);
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
|
||||
// Yield and do not do any other kind of compaction.
|
||||
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
|
||||
return Ok(CompactionOutcome::Pending);
|
||||
}
|
||||
}
|
||||
partitioning.parts.len()
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||
// error but continue.
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||
// error but continue.
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
tracing::error!(
|
||||
"could not compact, repartitioning keyspace failed: {err:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
1
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
let partition_count = self.partitioning.read().0 .0.parts.len();
|
||||
|
||||
// 4. Shard ancestor compaction
|
||||
|
||||
@@ -2223,8 +2259,11 @@ impl Timeline {
|
||||
split_key_ranges.push((start, end));
|
||||
}
|
||||
split_key_ranges.sort();
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
let mut current_start = None;
|
||||
let ranges_num = split_key_ranges.len();
|
||||
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
|
||||
@@ -2236,14 +2275,23 @@ impl Timeline {
|
||||
// We have already processed this partition.
|
||||
continue;
|
||||
}
|
||||
let res = layer_map.range_search(start..end, compact_below_lsn);
|
||||
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
|
||||
let overlapping_layers = {
|
||||
let mut desc = Vec::new();
|
||||
for layer in all_layers.iter() {
|
||||
if overlaps_with(&layer.get_key_range(), &(start..end))
|
||||
&& layer.get_lsn_range().start <= compact_below_lsn
|
||||
{
|
||||
desc.push(layer.clone());
|
||||
}
|
||||
}
|
||||
desc
|
||||
};
|
||||
let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::<u64>();
|
||||
if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 {
|
||||
// Try to extend the compaction range so that we include at least one full layer file.
|
||||
let extended_end = res
|
||||
.found
|
||||
.keys()
|
||||
.map(|layer| layer.layer.key_range.end)
|
||||
let extended_end = overlapping_layers
|
||||
.iter()
|
||||
.map(|layer| layer.key_range.end)
|
||||
.min();
|
||||
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
|
||||
// In this case, we simply use the specified key range end.
|
||||
@@ -2270,7 +2318,6 @@ impl Timeline {
|
||||
current_start = Some(end);
|
||||
}
|
||||
}
|
||||
drop(guard);
|
||||
Ok(compact_jobs)
|
||||
}
|
||||
|
||||
|
||||
@@ -17,13 +17,11 @@ use crate::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
|
||||
TenantManifestError, TimelineOrOffloaded,
|
||||
TenantManifestError, Timeline, TimelineOrOffloaded,
|
||||
},
|
||||
virtual_file::MaybeFatalIo,
|
||||
};
|
||||
|
||||
use super::{Timeline, TimelineResources};
|
||||
|
||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
/// during attach or pageserver restart.
|
||||
/// See comment in persist_index_part_with_deleted_flag.
|
||||
@@ -296,12 +294,7 @@ impl DeleteTimelineFlow {
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: tenant.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
|
||||
},
|
||||
tenant.get_timeline_resources_for(remote_client),
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
@@ -341,6 +334,13 @@ impl DeleteTimelineFlow {
|
||||
let tenant_shard_id = timeline.tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id();
|
||||
|
||||
// Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest.
|
||||
let Ok(tenant_guard) = tenant.gate.enter() else {
|
||||
// It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion.
|
||||
info!("Tenant is shutting down, timeline deletion will be resumed when it next starts");
|
||||
return;
|
||||
};
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
@@ -348,6 +348,8 @@ impl DeleteTimelineFlow {
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
async move {
|
||||
let _guard = tenant_guard;
|
||||
|
||||
if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
|
||||
// Only log as an error if it's not a cancellation.
|
||||
if matches!(err, DeleteTimelineError::Cancelled) {
|
||||
|
||||
@@ -30,8 +30,11 @@ use crate::{
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
|
||||
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
|
||||
size::CalculateSyntheticSizeError,
|
||||
storage_layer::LayerVisibilityHint,
|
||||
tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit},
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -80,8 +83,6 @@ impl Timeline {
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
|
||||
// acquire the gate guard only once within a useful span
|
||||
let Ok(guard) = self.gate.enter() else {
|
||||
return;
|
||||
@@ -94,7 +95,7 @@ impl Timeline {
|
||||
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||
};
|
||||
if random_init_delay(period, &self.cancel).await.is_err() {
|
||||
if sleep_random(period, &self.cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -330,9 +331,10 @@ impl Timeline {
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
|
||||
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
|
||||
let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit(
|
||||
BackgroundLoopKind::Eviction,
|
||||
false,
|
||||
ctx,
|
||||
);
|
||||
|
||||
@@ -374,7 +376,7 @@ impl Timeline {
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
permit: tokio::sync::SemaphorePermit<'static>,
|
||||
permit: BackgroundLoopSemaphorePermit<'static>,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
|
||||
@@ -39,7 +39,7 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
|
||||
use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
/// Status of the connection.
|
||||
@@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection(
    // advances it to its end LSN. 0 is just an initialization placeholder.
    let mut modification = timeline.begin_modification(Lsn(0));

    async fn commit(
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
        uncommitted: &mut u64,
    ) -> anyhow::Result<()> {
        let stats = modification.stats();
        modification.commit(ctx).await?;
        WAL_INGEST.records_committed.inc_by(*uncommitted);
        WAL_INGEST.inc_values_committed(&stats);
        *uncommitted = 0;
        Ok(())
    }

    if !records.is_empty() {
        timeline
            .metrics
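The new `commit` helper above exists so that every flush point updates the ingest metrics and resets the batch counter the same way, instead of repeating `modification.commit(...)` plus a manual reset at each call site (as the following hunks show). A stripped-down, synchronous sketch of the same shape, with hypothetical stand-ins for `DatadirModification` and the `WAL_INGEST` counters:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Hypothetical stand-in for the records_committed counter.
static RECORDS_COMMITTED: AtomicU64 = AtomicU64::new(0);

// Hypothetical stand-in for DatadirModification.
struct Modification { pending: Vec<String> }

impl Modification {
    fn commit(&mut self) -> Result<(), String> {
        self.pending.clear();
        Ok(())
    }
}

/// Commit, bump the counter by however many records were batched, and reset
/// the batch counter in one place.
fn commit(modification: &mut Modification, uncommitted: &mut u64) -> Result<(), String> {
    modification.commit()?;
    RECORDS_COMMITTED.fetch_add(*uncommitted, Ordering::Relaxed);
    *uncommitted = 0;
    Ok(())
}

fn main() {
    let mut m = Modification { pending: vec!["rec".into(); 3] };
    let mut uncommitted = 3;
    commit(&mut m, &mut uncommitted).unwrap();
    assert_eq!(uncommitted, 0);
    assert_eq!(RECORDS_COMMITTED.load(Ordering::Relaxed), 3);
}
```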
@@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
|
||||
}
|
||||
|
||||
let local_next_record_lsn = interpreted.next_record_lsn;
|
||||
@@ -381,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {local_next_record_lsn}")
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
|
||||
uncommitted_records += 1;
|
||||
@@ -396,8 +415,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -415,7 +433,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
if uncommitted_records > 0 || needs_last_record_lsn_advance {
|
||||
// Commit any uncommitted records
|
||||
modification.commit(&ctx).await?;
|
||||
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
|
||||
}
|
||||
|
||||
if !caught_up && streaming_lsn >= end_of_wal {
|
||||
@@ -442,10 +460,12 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let stats = modification.stats();
|
||||
modification.commit(ctx).await?;
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
modification.commit(ctx).await?;
|
||||
WAL_INGEST.inc_values_committed(&stats);
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
@@ -507,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {next_record_lsn}")
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
if !ingested {
|
||||
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
|
||||
|
||||
@@ -28,17 +28,9 @@ use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::fsm_logical_to_physical;
|
||||
use postgres_ffi::walrecord::*;
|
||||
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
|
||||
use wal_decoder::models::*;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use tracing::*;
|
||||
use utils::failpoint_support;
|
||||
use utils::rate_limit::RateLimit;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
@@ -50,11 +42,18 @@ use crate::ZERO_PAGE;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::fsm_logical_to_physical;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::walrecord::*;
|
||||
use postgres_ffi::TransactionId;
|
||||
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{critical, failpoint_support};
|
||||
use wal_decoder::models::*;
|
||||
|
||||
enum_pgversion! {CheckPoint, pgv::CheckPoint}
|
||||
|
||||
@@ -327,93 +326,75 @@ impl WalIngest {
|
||||
let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
|
||||
let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
|
||||
|
||||
// Sometimes, Postgres seems to create heap WAL records with the
|
||||
// ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
|
||||
// not set. In fact, it's possible that the VM page does not exist at all.
|
||||
// In that case, we don't want to store a record to clear the VM bit;
|
||||
// replaying it would fail to find the previous image of the page, because
|
||||
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
|
||||
// record if it doesn't.
|
||||
//
|
||||
// TODO: analyze the metrics and tighten this up accordingly. This logic
|
||||
// implicitly assumes that VM pages see explicit WAL writes before
|
||||
// implicit ClearVmBits, and will otherwise silently drop updates.
|
||||
// VM bits can only be cleared on the shard(s) owning the VM relation, and must be within
|
||||
// its view of the VM relation size. Out of caution, error instead of failing WAL ingestion,
|
||||
// as there has historically been cases where PostgreSQL has cleared spurious VM pages. See:
|
||||
// https://github.com/neondatabase/neon/pull/10634.
|
||||
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
|
||||
WAL_INGEST
|
||||
.clear_vm_bits_unknown
|
||||
.with_label_values(&["relation"])
|
||||
.inc();
|
||||
critical!("clear_vm_bits for unknown VM relation {vm_rel}");
|
||||
return Ok(());
|
||||
};
|
||||
if let Some(blknum) = new_vm_blk {
|
||||
if blknum >= vm_size {
|
||||
WAL_INGEST
|
||||
.clear_vm_bits_unknown
|
||||
.with_label_values(&["new_page"])
|
||||
.inc();
|
||||
critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
|
||||
new_vm_blk = None;
|
||||
}
|
||||
}
|
||||
if let Some(blknum) = old_vm_blk {
|
||||
if blknum >= vm_size {
|
||||
WAL_INGEST
|
||||
.clear_vm_bits_unknown
|
||||
.with_label_values(&["old_page"])
|
||||
.inc();
|
||||
critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
|
||||
old_vm_blk = None;
|
||||
}
|
||||
}
|
||||
|
||||
if new_vm_blk.is_some() || old_vm_blk.is_some() {
|
||||
if new_vm_blk == old_vm_blk {
|
||||
// An UPDATE record that needs to clear the bits for both old and the
|
||||
// new page, both of which reside on the same VM page.
|
||||
if new_vm_blk.is_none() && old_vm_blk.is_none() {
|
||||
return Ok(());
|
||||
} else if new_vm_blk == old_vm_blk {
|
||||
// An UPDATE record that needs to clear the bits for both old and the new page, both of
|
||||
// which reside on the same VM page.
|
||||
self.put_rel_wal_record(
|
||||
modification,
|
||||
vm_rel,
|
||||
new_vm_blk.unwrap(),
|
||||
NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
flags,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
// Clear VM bits for one heap page, or for two pages that reside on different VM pages.
|
||||
if let Some(new_vm_blk) = new_vm_blk {
|
||||
self.put_rel_wal_record(
|
||||
modification,
|
||||
vm_rel,
|
||||
new_vm_blk.unwrap(),
|
||||
new_vm_blk,
|
||||
NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno: None,
|
||||
flags,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
if let Some(old_vm_blk) = old_vm_blk {
|
||||
self.put_rel_wal_record(
|
||||
modification,
|
||||
vm_rel,
|
||||
old_vm_blk,
|
||||
NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno: None,
|
||||
old_heap_blkno,
|
||||
flags,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
// Clear VM bits for one heap page, or for two pages that reside on
|
||||
// different VM pages.
|
||||
if let Some(new_vm_blk) = new_vm_blk {
|
||||
self.put_rel_wal_record(
|
||||
modification,
|
||||
vm_rel,
|
||||
new_vm_blk,
|
||||
NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno: None,
|
||||
flags,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
if let Some(old_vm_blk) = old_vm_blk {
|
||||
self.put_rel_wal_record(
|
||||
modification,
|
||||
vm_rel,
|
||||
old_vm_blk,
|
||||
NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno: None,
|
||||
old_heap_blkno,
|
||||
flags,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
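To summarize the guard logic in this hunk: a ClearVmBits update is only applied when the VM relation is known on this shard and the referenced VM block lies inside its current size; anything else is dropped and reported via `critical!` rather than failing WAL ingestion. A compact sketch of that clamping (a hypothetical helper with `u32` block numbers, not the actual `WalIngest` API):

```rust
/// Keep a VM block reference only if the VM relation is known and the block
/// is inside its current size; otherwise drop it (the real code also raises a
/// `critical!` log in these cases).
fn clamp_vm_blk(blk: Option<u32>, vm_size: Option<u32>) -> Option<u32> {
    let vm_size = vm_size?; // unknown relation: drop the update entirely
    blk.filter(|&b| b < vm_size) // out-of-range block: drop just this side
}

fn main() {
    assert_eq!(clamp_vm_blk(Some(3), Some(10)), Some(3));
    assert_eq!(clamp_vm_blk(Some(12), Some(10)), None);
    assert_eq!(clamp_vm_blk(Some(3), None), None);
}
```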
|
||||
|
||||
|
||||
@@ -79,6 +79,14 @@ impl WalRedoProcess {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env(
|
||||
"ASAN_OPTIONS",
|
||||
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
|
||||
)
|
||||
.env(
|
||||
"UBSAN_OPTIONS",
|
||||
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
|
||||
)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
|
||||
@@ -509,47 +509,44 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
|
||||
tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
|
||||
chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
|
||||
if (!LFC_ENABLED())
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
return 0;
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
|
||||
if (LFC_ENABLED())
|
||||
{
|
||||
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
|
||||
int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
|
||||
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
|
||||
|
||||
if (entry != NULL)
|
||||
if (entry != NULL)
|
||||
{
|
||||
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
|
||||
{
|
||||
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
|
||||
if ((entry->bitmap[chunk_offs >> 5] &
|
||||
((uint32)1 << (chunk_offs & 31))) != 0)
|
||||
{
|
||||
if ((entry->bitmap[chunk_offs >> 5] &
|
||||
((uint32)1 << (chunk_offs & 31))) != 0)
|
||||
{
|
||||
BITMAP_SET(bitmap, i);
|
||||
found++;
|
||||
}
|
||||
BITMAP_SET(bitmap, i);
|
||||
found++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
i += this_chunk;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
return found;
|
||||
i += this_chunk;
|
||||
}
|
||||
|
||||
/*
|
||||
* Break out of the iteration before doing expensive stuff for
|
||||
* a next iteration
|
||||
*/
|
||||
if (i + 1 >= nblocks)
|
||||
if (i >= nblocks)
|
||||
break;
|
||||
|
||||
/*
|
||||
@@ -563,8 +560,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
#if USE_ASSERT_CHECKING
|
||||
do {
|
||||
#ifdef USE_ASSERT_CHECKING
|
||||
{
|
||||
int count = 0;
|
||||
|
||||
for (int j = 0; j < nblocks; j++)
|
||||
@@ -574,7 +571,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
|
||||
Assert(count == found);
|
||||
} while (false);
|
||||
}
|
||||
#endif
|
||||
|
||||
return found;
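For readers following the chunk arithmetic in this C hunk: the fix replaces `blkno + i` with plain `blkno` when locating the first chunk, and `nblocks` with `nblocks - i` when sizing each step. The underlying math, expressed as a small Rust sketch under the assumption that `BLOCKS_PER_CHUNK` is a power of two (128 is only an illustrative value here):

```rust
const BLOCKS_PER_CHUNK: u32 = 128; // assumption: power of two, as in the LFC

/// First block of the chunk containing `blkno`, and the offset within it.
fn chunk_of(blkno: u32) -> (u32, u32) {
    (blkno & !(BLOCKS_PER_CHUNK - 1), blkno & (BLOCKS_PER_CHUNK - 1))
}

fn main() {
    assert_eq!(chunk_of(0), (0, 0));
    assert_eq!(chunk_of(130), (128, 2));
    // Blocks of a request that fall into the current chunk, mirroring
    // `Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs)`:
    let (_, offs) = chunk_of(130);
    let (nblocks, i) = (300u32, 0u32);
    let this_chunk = (nblocks - i).min(BLOCKS_PER_CHUNK - offs);
    assert_eq!(this_chunk, 126);
}
```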
|
||||
|
||||
@@ -36,6 +36,11 @@
|
||||
#include "pagestore_client.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/ioctl.h>
|
||||
#include <linux/sockios.h>
|
||||
#endif
|
||||
|
||||
#define PageStoreTrace DEBUG5
|
||||
|
||||
#define MIN_RECONNECT_INTERVAL_USEC 1000
|
||||
@@ -728,11 +733,36 @@ retry:
|
||||
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
|
||||
if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS)
|
||||
{
|
||||
int sndbuf = -1;
|
||||
int recvbuf = -1;
|
||||
#ifdef __linux__
|
||||
int socketfd;
|
||||
#endif
|
||||
|
||||
since_start = now;
|
||||
INSTR_TIME_SUBTRACT(since_start, start_ts);
|
||||
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
|
||||
|
||||
#ifdef __linux__
|
||||
/*
|
||||
* get kernel's send and recv queue size via ioctl
|
||||
* https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27
|
||||
*/
|
||||
socketfd = PQsocket(pageserver_conn);
|
||||
if (socketfd != -1) {
|
||||
int ioctl_err;
|
||||
ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf);
|
||||
if (ioctl_err!= 0) {
|
||||
sndbuf = -errno;
|
||||
}
|
||||
ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf);
|
||||
if (ioctl_err != 0) {
|
||||
recvbuf = -errno;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)",
|
||||
INSTR_TIME_GET_DOUBLE(since_start),
|
||||
shard->nrequests_sent, shard->nresponses_received);
|
||||
shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf);
|
||||
last_log_ts = now;
|
||||
logged = true;
|
||||
}
|
||||
|
||||
@@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
|
||||
{
|
||||
uint64 min_ring_index;
|
||||
PrefetchRequest hashkey;
|
||||
#if USE_ASSERT_CHECKING
|
||||
#ifdef USE_ASSERT_CHECKING
|
||||
bool any_hits = false;
|
||||
#endif
|
||||
/* We will never read further ahead than our buffer can store. */
|
||||
@@ -955,7 +955,7 @@ Retry:
|
||||
else
|
||||
lsns = NULL;
|
||||
|
||||
#if USE_ASSERT_CHECKING
|
||||
#ifdef USE_ASSERT_CHECKING
|
||||
any_hits = true;
|
||||
#endif
|
||||
|
||||
@@ -3011,7 +3011,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
|
||||
start_ts = GetCurrentTimestamp();
|
||||
|
||||
if (RecoveryInProgress() && MyBackendType != B_STARTUP)
|
||||
XLogWaitForReplayOf(reqlsns[0].request_lsn);
|
||||
XLogWaitForReplayOf(reqlsns->request_lsn);
|
||||
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
|
||||
@@ -19,6 +19,7 @@ aws-config.workspace = true
|
||||
aws-sdk-iam.workspace = true
|
||||
aws-sigv4.workspace = true
|
||||
base64.workspace = true
|
||||
boxcar = "0.2.8"
|
||||
bstr.workspace = true
|
||||
bytes = { workspace = true, features = ["serde"] }
|
||||
camino.workspace = true
|
||||
@@ -42,6 +43,7 @@ hyper0.workspace = true
|
||||
hyper = { workspace = true, features = ["server", "http1", "http2"] }
|
||||
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
|
||||
http-body-util = { version = "0.1" }
|
||||
gettid = "0.1.3"
|
||||
indexmap = { workspace = true, features = ["serde"] }
|
||||
ipnet.workspace = true
|
||||
itertools.workspace = true
|
||||
@@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] }
|
||||
measured = { workspace = true, features = ["lasso"] }
|
||||
metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry = { workspace = true, features = ["trace"] }
|
||||
papaya = "0.1.8"
|
||||
parking_lot.workspace = true
|
||||
parquet.workspace = true
|
||||
parquet_derive.workspace = true
|
||||
@@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] }
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-log.workspace = true
|
||||
tracing-serde.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
try-lock.workspace = true
|
||||
typed-json.workspace = true
|
||||
url.workspace = true
|
||||
@@ -112,6 +119,7 @@ rsa = "0.9"
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
assert-json-diff.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
flate2.workspace = true
|
||||
|
||||
@@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation
|
||||
|
||||
If both postgres and proxy are running you may send a SQL query:
|
||||
```console
|
||||
curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
|
||||
-H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
|
||||
curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \
|
||||
-H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
|
||||
@@ -104,7 +104,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie
|
||||
|
||||
## Test proxy locally
|
||||
|
||||
-Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`.
+Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.local.neon.build` which resolves to `127.0.0.1`.
|
||||
|
||||
We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows:
|
||||
```sh
|
||||
@@ -125,7 +125,7 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPER
|
||||
|
||||
Let's create self-signed certificate by running:
|
||||
```sh
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build"
|
||||
```
|
||||
|
||||
Then we need to build proxy with 'testing' feature and run, e.g.:
|
||||
@@ -136,5 +136,5 @@ RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backe
|
||||
Now from client you can start a new session:
|
||||
|
||||
```sh
|
||||
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full"
|
||||
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full"
|
||||
```
|
||||
|
||||
@@ -108,6 +108,10 @@ impl<T> Backend<'_, T> {
|
||||
Self::Local(_) => panic!("Local backend has no API"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_local_proxy(&self) -> bool {
|
||||
matches!(self, Self::Local(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Backend<'a, T> {
|
||||
|
||||
@@ -69,17 +69,35 @@ pub async fn handle_cancel_messages(
|
||||
value,
|
||||
resp_tx,
|
||||
_guard,
|
||||
expire: _,
|
||||
expire,
|
||||
} => {
|
||||
let res = client.hset(&key, field, value).await;
|
||||
if let Some(resp_tx) = resp_tx {
|
||||
resp_tx
|
||||
.send(client.hset(key, field, value).await)
|
||||
.inspect_err(|e| {
|
||||
tracing::debug!("failed to send StoreCancelKey response: {:?}", e);
|
||||
})
|
||||
.ok();
|
||||
if res.is_ok() {
|
||||
resp_tx
|
||||
.send(client.expire(key, expire).await)
|
||||
.inspect_err(|e| {
|
||||
tracing::debug!(
|
||||
"failed to send StoreCancelKey response: {:?}",
|
||||
e
|
||||
);
|
||||
})
|
||||
.ok();
|
||||
} else {
|
||||
resp_tx
|
||||
.send(res)
|
||||
.inspect_err(|e| {
|
||||
tracing::debug!(
|
||||
"failed to send StoreCancelKey response: {:?}",
|
||||
e
|
||||
);
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
} else if res.is_ok() {
|
||||
drop(client.expire(key, expire).await);
|
||||
} else {
|
||||
drop(client.hset(key, field, value).await);
|
||||
tracing::warn!("failed to store cancel key: {:?}", res);
|
||||
}
|
||||
}
|
||||
CancelKeyOp::GetCancelData {
|
||||
@@ -436,7 +454,7 @@ impl Session {
|
||||
&self.key
|
||||
}
|
||||
|
||||
// Send the store key op to the cancellation handler
|
||||
// Send the store key op to the cancellation handler and set TTL for the key
|
||||
pub(crate) async fn write_cancel_key(
|
||||
&self,
|
||||
cancel_closure: CancelClosure,
|
||||
|
||||
@@ -1,10 +1,23 @@
|
||||
use tracing::Subscriber;
|
||||
use std::cell::{Cell, RefCell};
|
||||
use std::collections::HashMap;
|
||||
use std::hash::BuildHasher;
|
||||
use std::{env, io};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use opentelemetry::trace::TraceContextExt;
|
||||
use scopeguard::defer;
|
||||
use serde::ser::{SerializeMap, Serializer};
|
||||
use tracing::span;
|
||||
use tracing::subscriber::Interest;
|
||||
use tracing::{callsite, Event, Metadata, Span, Subscriber};
|
||||
use tracing_opentelemetry::OpenTelemetrySpanExt;
|
||||
use tracing_subscriber::filter::{EnvFilter, LevelFilter};
|
||||
use tracing_subscriber::fmt::format::{Format, Full};
|
||||
use tracing_subscriber::fmt::time::SystemTime;
|
||||
use tracing_subscriber::fmt::{FormatEvent, FormatFields};
|
||||
use tracing_subscriber::layer::{Context, Layer};
|
||||
use tracing_subscriber::prelude::*;
|
||||
use tracing_subscriber::registry::LookupSpan;
|
||||
use tracing_subscriber::registry::{LookupSpan, SpanRef};
|
||||
|
||||
/// Initialize logging and OpenTelemetry tracing and exporter.
|
||||
///
|
||||
@@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan;
|
||||
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
|
||||
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
|
||||
pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
let logfmt = LogFormat::from_env()?;
|
||||
|
||||
let env_filter = EnvFilter::builder()
|
||||
.with_default_directive(LevelFilter::INFO.into())
|
||||
.from_env_lossy()
|
||||
@@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
.expect("this should be a valid filter directive"),
|
||||
);
|
||||
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_ansi(false)
|
||||
.with_writer(std::io::stderr)
|
||||
.with_target(false);
|
||||
|
||||
let otlp_layer = tracing_utils::init_tracing("proxy").await;
|
||||
|
||||
let json_log_layer = if logfmt == LogFormat::Json {
|
||||
Some(JsonLoggingLayer {
|
||||
clock: RealClock,
|
||||
skipped_field_indices: papaya::HashMap::default(),
|
||||
writer: StderrWriter {
|
||||
stderr: std::io::stderr(),
|
||||
},
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let text_log_layer = if logfmt == LogFormat::Text {
|
||||
Some(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_ansi(false)
|
||||
.with_writer(std::io::stderr)
|
||||
.with_target(false),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(otlp_layer)
|
||||
.with(fmt_layer)
|
||||
.with(json_log_layer)
|
||||
.with(text_log_layer)
|
||||
.try_init()?;
|
||||
|
||||
Ok(LoggingGuard)
|
||||
@@ -94,3 +128,857 @@ impl Drop for LoggingGuard {
|
||||
tracing_utils::shutdown_tracing();
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: make JSON the default
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
|
||||
enum LogFormat {
|
||||
#[default]
|
||||
Text = 1,
|
||||
Json,
|
||||
}
|
||||
|
||||
impl LogFormat {
|
||||
fn from_env() -> anyhow::Result<Self> {
|
||||
let logfmt = env::var("LOGFMT");
|
||||
Ok(match logfmt.as_deref() {
|
||||
Err(_) => LogFormat::default(),
|
||||
Ok("text") => LogFormat::Text,
|
||||
Ok("json") => LogFormat::Json,
|
||||
Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
trait MakeWriter {
|
||||
fn make_writer(&self) -> impl io::Write;
|
||||
}
|
||||
|
||||
struct StderrWriter {
|
||||
stderr: io::Stderr,
|
||||
}
|
||||
|
||||
impl MakeWriter for StderrWriter {
|
||||
#[inline]
|
||||
fn make_writer(&self) -> impl io::Write {
|
||||
self.stderr.lock()
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: move into separate module or even separate crate.
|
||||
trait Clock {
|
||||
fn now(&self) -> DateTime<Utc>;
|
||||
}
|
||||
|
||||
struct RealClock;
|
||||
|
||||
impl Clock for RealClock {
|
||||
#[inline]
|
||||
fn now(&self) -> DateTime<Utc> {
|
||||
Utc::now()
|
||||
}
|
||||
}
|
||||
|
||||
/// Name of the field used by tracing crate to store the event message.
|
||||
const MESSAGE_FIELD: &str = "message";
|
||||
|
||||
thread_local! {
|
||||
/// Protects against deadlocks and double panics during log writing.
|
||||
/// The current panic handler will use tracing to log panic information.
|
||||
static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
|
||||
/// Thread-local instance with per-thread buffer for log writing.
|
||||
static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
|
||||
/// Cached OS thread ID.
|
||||
static THREAD_ID: u64 = gettid::gettid();
|
||||
}
|
||||
|
||||
/// Implements tracing layer to handle events specific to logging.
|
||||
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
|
||||
clock: C,
|
||||
skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
|
||||
writer: W,
|
||||
}
|
||||
|
||||
impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
{
|
||||
fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) {
|
||||
use std::io::Write;
|
||||
|
||||
// TODO: consider special tracing subscriber to grab timestamp very
|
||||
// early, before OTel machinery, and add as event extension.
|
||||
let now = self.clock.now();
|
||||
|
||||
let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
|
||||
if entered.get() {
|
||||
let mut formatter = EventFormatter::new();
|
||||
formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
|
||||
self.writer.make_writer().write_all(formatter.buffer())
|
||||
} else {
|
||||
entered.set(true);
|
||||
defer!(entered.set(false););
|
||||
|
||||
EVENT_FORMATTER.with_borrow_mut(move |formatter| {
|
||||
formatter.reset();
|
||||
formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
|
||||
self.writer.make_writer().write_all(formatter.buffer())
|
||||
})
|
||||
}
|
||||
});
|
||||
|
||||
// In case logging fails we generate a simpler JSON object.
|
||||
if let Err(err) = res {
|
||||
if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
|
||||
"timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
|
||||
"level": "ERROR",
|
||||
"message": format_args!("cannot log event: {err:?}"),
|
||||
"fields": {
|
||||
"event": format_args!("{event:?}"),
|
||||
},
|
||||
})) {
|
||||
line.push(b'\n');
|
||||
self.writer.make_writer().write_all(&line).ok();
|
||||
}
|
||||
}
|
||||
}
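The thread-local `REENTRANCY_GUARD` used above protects against the logging layer being re-entered on the same thread (for example from a panic hook that itself logs): the re-entrant call formats into a fresh buffer instead of the shared thread-local one. A stripped-down sketch of that pattern, assuming `scopeguard` (already imported in this file) for the reset-on-unwind:

```rust
use std::cell::Cell;
use scopeguard::defer;

thread_local! {
    static ENTERED: Cell<bool> = const { Cell::new(false) };
}

fn with_reentrancy_guard(fast_path: impl FnOnce(), fallback: impl FnOnce()) {
    ENTERED.with(|entered| {
        if entered.get() {
            // Already inside the logger on this thread: avoid the shared
            // thread-local buffer and any chance of re-borrowing it.
            fallback();
        } else {
            entered.set(true);
            defer!(entered.set(false)); // reset even if fast_path panics
            fast_path();
        }
    });
}

fn main() {
    with_reentrancy_guard(
        || println!("normal logging path"),
        || println!("re-entrant fallback path"),
    );
}
```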
|
||||
|
||||
/// Registers a SpanFields instance as span extension.
|
||||
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
|
||||
let span = ctx.span(id).expect("span must exist");
|
||||
let fields = SpanFields::default();
|
||||
fields.record_fields(attrs);
|
||||
// This could deadlock when there's a panic somewhere in the tracing
|
||||
// event handling and a read or write guard is still held. This includes
|
||||
// the OTel subscriber.
|
||||
span.extensions_mut().insert(fields);
|
||||
}
|
||||
|
||||
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
|
||||
let span = ctx.span(id).expect("span must exist");
|
||||
let ext = span.extensions();
|
||||
if let Some(data) = ext.get::<SpanFields>() {
|
||||
data.record_fields(values);
|
||||
}
|
||||
}
|
||||
|
||||
    /// Called (lazily) whenever a new log call is executed. We quickly check
    /// for duplicate field names and record duplicates as skippable. Last one
    /// wins.
    fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
        if !metadata.is_event() {
            // Must not be never because we wouldn't get trace and span data.
            return Interest::always();
        }

        let mut field_indices = SkippedFieldIndices::default();
        let mut seen_fields = HashMap::<&'static str, usize>::new();
        for field in metadata.fields() {
            use std::collections::hash_map::Entry;
            match seen_fields.entry(field.name()) {
                Entry::Vacant(entry) => {
                    // field not seen yet
                    entry.insert(field.index());
                }
                Entry::Occupied(mut entry) => {
                    // replace currently stored index
                    let old_index = entry.insert(field.index());
                    // ... and append it to list of skippable indices
                    field_indices.push(old_index);
                }
            }
        }

        if !field_indices.is_empty() {
            self.skipped_field_indices
                .pin()
                .insert(metadata.callsite(), field_indices);
        }

        Interest::always()
    }
}
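`register_callsite` above runs once per callsite and precomputes which field indices to drop when the same field name is recorded more than once (last occurrence wins). The detection itself is just an entry-API scan; a self-contained sketch over plain name lists:

```rust
use std::collections::HashMap;

/// Given the field names of a callsite in declaration order, return the
/// indices that should be skipped when the event is later serialized.
fn skipped_indices(field_names: &[&str]) -> Vec<usize> {
    let mut skipped = Vec::new();
    let mut seen = HashMap::new();
    for (index, name) in field_names.iter().enumerate() {
        if let Some(old_index) = seen.insert(*name, index) {
            // An earlier occurrence is shadowed by this one.
            skipped.push(old_index);
        }
    }
    skipped
}

fn main() {
    // e.g. an event that records `user` twice keeps only the last value.
    assert_eq!(skipped_indices(&["message", "user", "user"]), vec![1]);
    assert_eq!(skipped_indices(&["message", "a", "b"]), Vec::<usize>::new());
}
```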
|
||||
|
||||
/// Stores span field values recorded during the spans lifetime.
|
||||
#[derive(Default)]
|
||||
struct SpanFields {
|
||||
// TODO: Switch to custom enum with lasso::Spur for Strings?
|
||||
fields: papaya::HashMap<&'static str, serde_json::Value>,
|
||||
}
|
||||
|
||||
impl SpanFields {
|
||||
#[inline]
|
||||
fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
|
||||
fields.record(&mut SpanFieldsRecorder {
|
||||
fields: self.fields.pin(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements a tracing field visitor to convert and store values.
|
||||
struct SpanFieldsRecorder<'m, S, G> {
|
||||
fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
|
||||
}
|
||||
|
||||
impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
|
||||
#[inline]
|
||||
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
|
||||
if let Ok(value) = i64::try_from(value) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
} else {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value}")));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
|
||||
if let Ok(value) = u64::try_from(value) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
} else {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value}")));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value:?}")));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_error(
|
||||
&mut self,
|
||||
field: &tracing::field::Field,
|
||||
value: &(dyn std::error::Error + 'static),
|
||||
) {
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value}")));
|
||||
}
|
||||
}
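One detail worth noting in the visitor above: 128-bit integers are narrowed to 64-bit JSON numbers when they fit and serialized as strings otherwise. A tiny sketch of that fallback (using `serde_json`, which is already a dependency of this module; `record_i128` here is a hypothetical free function, not the visitor method itself):

```rust
use serde_json::Value;

/// Keep 128-bit values numeric when they fit in i64, otherwise fall back to a
/// string representation.
fn record_i128(value: i128) -> Value {
    match i64::try_from(value) {
        Ok(v) => Value::from(v),
        Err(_) => Value::from(value.to_string()),
    }
}

fn main() {
    assert_eq!(record_i128(42), Value::from(42i64));
    assert_eq!(record_i128(i128::MAX), Value::from(i128::MAX.to_string()));
}
```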
|
||||
|
||||
/// List of field indices skipped during logging. Can list duplicate fields or
|
||||
/// metafields not meant to be logged.
|
||||
#[derive(Clone, Default)]
|
||||
struct SkippedFieldIndices {
|
||||
bits: u64,
|
||||
}
|
||||
|
||||
impl SkippedFieldIndices {
|
||||
#[inline]
|
||||
fn is_empty(&self) -> bool {
|
||||
self.bits == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn push(&mut self, index: usize) {
|
||||
self.bits |= 1u64
|
||||
.checked_shl(index as u32)
|
||||
.expect("field index too large");
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn contains(&self, index: usize) -> bool {
|
||||
self.bits
|
||||
& 1u64
|
||||
.checked_shl(index as u32)
|
||||
.expect("field index too large")
|
||||
!= 0
|
||||
}
|
||||
}
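// Illustrative sketch (not part of this diff): how a logging layer might build
// a `SkippedFieldIndices` for a callsite so that only the last occurrence of a
// duplicated field name gets logged. The function name and exact dedup policy
// here are assumptions for illustration only.
fn skipped_indices_for(fields: &tracing::field::FieldSet) -> SkippedFieldIndices {
    let mut skipped = SkippedFieldIndices::default();
    let names: Vec<&str> = fields.iter().map(|f| f.name()).collect();
    for (i, name) in names.iter().enumerate() {
        // Skip this occurrence if the same field name appears again later on.
        if names[i + 1..].contains(name) {
            skipped.push(i);
        }
    }
    skipped
}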
|
||||
|
||||
/// Formats a tracing event and writes it as JSON to its internal buffer, followed by a newline.
|
||||
// TODO: buffer capacity management, truncate if too large
|
||||
struct EventFormatter {
|
||||
logline_buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl EventFormatter {
|
||||
#[inline]
|
||||
fn new() -> Self {
|
||||
EventFormatter {
|
||||
logline_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn buffer(&self) -> &[u8] {
|
||||
&self.logline_buffer
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn reset(&mut self) {
|
||||
self.logline_buffer.clear();
|
||||
}
|
||||
|
||||
fn format<S>(
|
||||
&mut self,
|
||||
now: DateTime<Utc>,
|
||||
event: &Event<'_>,
|
||||
ctx: &Context<'_, S>,
|
||||
skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
{
|
||||
let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true);
|
||||
|
||||
use tracing_log::NormalizeEvent;
|
||||
let normalized_meta = event.normalized_metadata();
|
||||
let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());
|
||||
|
||||
let skipped_field_indices = skipped_field_indices.pin();
|
||||
let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
|
||||
|
||||
let mut serialize = || {
|
||||
let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
|
||||
|
||||
let mut serializer = serializer.serialize_map(None)?;
|
||||
|
||||
// Timestamp comes first, so raw lines can be sorted by timestamp.
|
||||
serializer.serialize_entry("timestamp", ×tamp)?;
|
||||
|
||||
// Level next.
|
||||
serializer.serialize_entry("level", &meta.level().as_str())?;
|
||||
|
||||
// Message next.
|
||||
serializer.serialize_key("message")?;
|
||||
let mut message_extractor =
|
||||
MessageFieldExtractor::new(serializer, skipped_field_indices);
|
||||
event.record(&mut message_extractor);
|
||||
let mut serializer = message_extractor.into_serializer()?;
|
||||
|
||||
let mut fields_present = FieldsPresent(false, skipped_field_indices);
|
||||
event.record(&mut fields_present);
|
||||
if fields_present.0 {
|
||||
serializer.serialize_entry(
|
||||
"fields",
|
||||
&SerializableEventFields(event, skipped_field_indices),
|
||||
)?;
|
||||
}
|
||||
|
||||
let pid = std::process::id();
|
||||
if pid != 1 {
|
||||
serializer.serialize_entry("process_id", &pid)?;
|
||||
}
|
||||
|
||||
THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?;
|
||||
|
||||
// TODO: tls cache? name could change
|
||||
if let Some(thread_name) = std::thread::current().name() {
|
||||
if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" {
|
||||
serializer.serialize_entry("thread_name", thread_name)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(task_id) = tokio::task::try_id() {
|
||||
serializer.serialize_entry("task_id", &format_args!("{task_id}"))?;
|
||||
}
|
||||
|
||||
serializer.serialize_entry("target", meta.target())?;
|
||||
|
||||
if let Some(module) = meta.module_path() {
|
||||
if module != meta.target() {
|
||||
serializer.serialize_entry("module", module)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(file) = meta.file() {
|
||||
if let Some(line) = meta.line() {
|
||||
serializer.serialize_entry("src", &format_args!("{file}:{line}"))?;
|
||||
} else {
|
||||
serializer.serialize_entry("src", file)?;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let otel_context = Span::current().context();
|
||||
let otel_spanref = otel_context.span();
|
||||
let span_context = otel_spanref.span_context();
|
||||
if span_context.is_valid() {
|
||||
serializer.serialize_entry(
|
||||
"trace_id",
|
||||
&format_args!("{}", span_context.trace_id()),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?;
|
||||
|
||||
serializer.end()
|
||||
};
|
||||
|
||||
serialize().map_err(io::Error::other)?;
|
||||
self.logline_buffer.push(b'\n');
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the message field that's mixed in with other fields.
|
||||
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
|
||||
serializer: S,
|
||||
skipped_field_indices: Option<&'a SkippedFieldIndices>,
|
||||
state: Option<Result<(), S::Error>>,
|
||||
}
|
||||
|
||||
impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
|
||||
#[inline]
|
||||
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
|
||||
Self {
|
||||
serializer,
|
||||
skipped_field_indices,
|
||||
state: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn into_serializer(mut self) -> Result<S, S::Error> {
|
||||
match self.state {
|
||||
Some(Ok(())) => {}
|
||||
Some(Err(err)) => return Err(err),
|
||||
None => self.serializer.serialize_value("")?,
|
||||
}
|
||||
Ok(self.serializer)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn accept_field(&self, field: &tracing::field::Field) -> bool {
|
||||
self.state.is_none()
|
||||
&& field.name() == MESSAGE_FIELD
|
||||
&& !self
|
||||
.skipped_field_indices
|
||||
.is_some_and(|i| i.contains(field.index()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
|
||||
#[inline]
|
||||
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}")));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&value));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}")));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_error(
|
||||
&mut self,
|
||||
field: &tracing::field::Field,
|
||||
value: &(dyn std::error::Error + 'static),
|
||||
) {
|
||||
if self.accept_field(field) {
|
||||
self.state = Some(self.serializer.serialize_value(&format_args!("{value}")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks whether any fields and field values are present. If not, the JSON
/// subobject can be skipped.
// This is entirely optional and only cosmetic, though it may help a bit during
// log parsing in dashboards by not emitting a "fields" key with an empty object.
|
||||
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
|
||||
|
||||
// Even though some methods have an overhead (error, bytes), it is assumed the
// compiler won't include it since we ignore the value entirely.
|
||||
impl tracing::field::Visit for FieldsPresent<'_> {
|
||||
#[inline]
|
||||
fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
|
||||
if !self.1.is_some_and(|i| i.contains(field.index()))
|
||||
&& field.name() != MESSAGE_FIELD
|
||||
&& !field.name().starts_with("log.")
|
||||
{
|
||||
self.0 |= true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes the fields directly supplied with a log event.
|
||||
struct SerializableEventFields<'a, 'event>(
|
||||
&'a tracing::Event<'event>,
|
||||
Option<&'a SkippedFieldIndices>,
|
||||
);
|
||||
|
||||
impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let serializer = serializer.serialize_map(None)?;
|
||||
let mut message_skipper = MessageFieldSkipper::new(serializer, self.1);
|
||||
self.0.record(&mut message_skipper);
|
||||
let serializer = message_skipper.into_serializer()?;
|
||||
serializer.end()
|
||||
}
|
||||
}
|
||||
|
||||
/// A tracing field visitor that skips the message field.
|
||||
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
|
||||
serializer: S,
|
||||
skipped_field_indices: Option<&'a SkippedFieldIndices>,
|
||||
state: Result<(), S::Error>,
|
||||
}
|
||||
|
||||
impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
|
||||
#[inline]
|
||||
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
|
||||
Self {
|
||||
serializer,
|
||||
skipped_field_indices,
|
||||
state: Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn accept_field(&self, field: &tracing::field::Field) -> bool {
|
||||
self.state.is_ok()
|
||||
&& field.name() != MESSAGE_FIELD
|
||||
&& !field.name().starts_with("log.")
|
||||
&& !self
|
||||
.skipped_field_indices
|
||||
.is_some_and(|i| i.contains(field.index()))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn into_serializer(self) -> Result<S, S::Error> {
|
||||
self.state?;
|
||||
Ok(self.serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
|
||||
#[inline]
|
||||
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self
|
||||
.serializer
|
||||
.serialize_entry(field.name(), &format_args!("{value:x?}"));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_entry(field.name(), &value);
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self
|
||||
.serializer
|
||||
.serialize_entry(field.name(), &format_args!("{value:?}"));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_error(
|
||||
&mut self,
|
||||
field: &tracing::field::Field,
|
||||
value: &(dyn std::error::Error + 'static),
|
||||
) {
|
||||
if self.accept_field(field) {
|
||||
self.state = self.serializer.serialize_value(&format_args!("{value}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes the span stack from root to leaf (the event's parent), enumerated
/// inside an object whose keys are the span's index, zero-padded to retain
/// sort order.
// The object is necessary because Loki cannot flatten arrays.
|
||||
struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
|
||||
where
|
||||
Span: Subscriber + for<'lookup> LookupSpan<'lookup>;
|
||||
|
||||
impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
|
||||
where
|
||||
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
|
||||
{
|
||||
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
|
||||
where
|
||||
Ser: serde::ser::Serializer,
|
||||
{
|
||||
let mut serializer = serializer.serialize_map(None)?;
|
||||
|
||||
if let Some(leaf_span) = self.0.lookup_current() {
|
||||
for (i, span) in leaf_span.scope().from_root().enumerate() {
|
||||
serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
|
||||
}
|
||||
}
|
||||
|
||||
serializer.end()
|
||||
}
|
||||
}
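// Illustrative example (not part of this diff): for an event nested inside two
// spans, the serialized "spans" entry takes roughly this shape (ids made up),
// with zero-padded numeric keys standing in for an array:
//
//   "spans": {
//     "00": { "span_id": "0000000000000001", "span_name": "outer", ... },
//     "01": { "span_id": "0000000000000002", "span_name": "inner", ... }
//   }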
|
||||
|
||||
/// Serializes a single span. Includes the span ID, name and its fields as
/// recorded up to this point.
|
||||
struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
|
||||
where
|
||||
Span: for<'lookup> LookupSpan<'lookup>;
|
||||
|
||||
impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
|
||||
where
|
||||
Span: for<'lookup> LookupSpan<'lookup>,
|
||||
{
|
||||
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
|
||||
where
|
||||
Ser: serde::ser::Serializer,
|
||||
{
|
||||
let mut serializer = serializer.serialize_map(None)?;
|
||||
// TODO: the span ID is probably only useful for debugging tracing.
|
||||
serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?;
|
||||
serializer.serialize_entry("span_name", self.0.metadata().name())?;
|
||||
|
||||
let ext = self.0.extensions();
|
||||
if let Some(data) = ext.get::<SpanFields>() {
|
||||
for (key, value) in &data.fields.pin() {
|
||||
serializer.serialize_entry(key, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
serializer.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::unwrap_used)]
|
||||
mod tests {
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use assert_json_diff::assert_json_eq;
|
||||
use tracing::info_span;
|
||||
|
||||
use super::*;
|
||||
|
||||
struct TestClock {
|
||||
current_time: Mutex<DateTime<Utc>>,
|
||||
}
|
||||
|
||||
impl Clock for Arc<TestClock> {
|
||||
fn now(&self) -> DateTime<Utc> {
|
||||
*self.current_time.lock().expect("poisoned")
|
||||
}
|
||||
}
|
||||
|
||||
struct VecWriter<'a> {
|
||||
buffer: MutexGuard<'a, Vec<u8>>,
|
||||
}
|
||||
|
||||
impl MakeWriter for Arc<Mutex<Vec<u8>>> {
|
||||
fn make_writer(&self) -> impl io::Write {
|
||||
VecWriter {
|
||||
buffer: self.lock().expect("poisoned"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Write for VecWriter<'_> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.buffer.write(buf)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_collection() {
|
||||
let clock = Arc::new(TestClock {
|
||||
current_time: Mutex::new(Utc::now()),
|
||||
});
|
||||
let buffer = Arc::new(Mutex::new(Vec::new()));
|
||||
let log_layer = JsonLoggingLayer {
|
||||
clock: clock.clone(),
|
||||
skipped_field_indices: papaya::HashMap::default(),
|
||||
writer: buffer.clone(),
|
||||
};
|
||||
|
||||
let registry = tracing_subscriber::Registry::default().with(log_layer);
|
||||
|
||||
tracing::subscriber::with_default(registry, || {
|
||||
info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| {
|
||||
info_span!("span2").in_scope(|| {
|
||||
tracing::error!(
|
||||
a = 1,
|
||||
a = 2,
|
||||
a = 3,
|
||||
message = "explicit message field",
|
||||
"implicit message field"
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
let buffer = Arc::try_unwrap(buffer)
|
||||
.expect("no other reference")
|
||||
.into_inner()
|
||||
.expect("poisoned");
|
||||
let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON");
|
||||
let expected: serde_json::Value = serde_json::json!(
|
||||
{
|
||||
"timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
|
||||
"level": "ERROR",
|
||||
"message": "explicit message field",
|
||||
"fields": {
|
||||
"a": 3,
|
||||
},
|
||||
"spans": {
|
||||
"00":{
|
||||
"span_id": "0000000000000001",
|
||||
"span_name": "span1",
|
||||
"x": 42,
|
||||
},
|
||||
"01": {
|
||||
"span_id": "0000000000000002",
|
||||
"span_name": "span2",
|
||||
}
|
||||
},
|
||||
"src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(),
|
||||
"target": "proxy::logging::tests",
|
||||
"process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(),
|
||||
"thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(),
|
||||
"thread_name": "logging::tests::test_field_collection",
|
||||
}
|
||||
);
|
||||
|
||||
assert_json_eq!(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -400,9 +400,9 @@ fn create_random_jwk() -> (SigningKey, jose_jwk::Key) {
pub(crate) enum HttpConnError {
#[error("pooled connection closed at inconsistent state")]
ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
#[error("could not connection to postgres in compute")]
#[error("could not connect to postgres in compute")]
PostgresConnectionError(#[from] postgres_client::Error),
#[error("could not connection to local-proxy in compute")]
#[error("could not connect to local-proxy in compute")]
LocalProxyConnectionError(#[from] LocalProxyConnError),
#[error("could not parse JWT payload")]
JwtPayloadError(serde_json::Error),
|
||||
|
||||
@@ -11,10 +11,12 @@ use http_body_util::{BodyExt, Full};
|
||||
use hyper::body::Incoming;
|
||||
use hyper::http::{HeaderName, HeaderValue};
|
||||
use hyper::{header, HeaderMap, Request, Response, StatusCode};
|
||||
use indexmap::IndexMap;
|
||||
use postgres_client::error::{DbError, ErrorPosition, SqlState};
|
||||
use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
|
||||
use pq_proto::StartupMessageParamsBuilder;
|
||||
use serde::Serialize;
|
||||
use serde_json::value::RawValue;
|
||||
use serde_json::Value;
|
||||
use tokio::time::{self, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -249,6 +251,50 @@ pub(crate) async fn handle(
|
||||
let mut response = match result {
|
||||
Ok(r) => {
|
||||
ctx.set_success();
|
||||
|
||||
// Handling the error response from local proxy here
|
||||
if config.authentication_config.is_auth_broker && r.status().is_server_error() {
|
||||
let status = r.status();
|
||||
|
||||
let body_bytes = r
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::Error::msg(format!(
|
||||
"could not collect http body: {e}"
|
||||
)))
|
||||
})?
|
||||
.to_bytes();
|
||||
|
||||
if let Ok(mut json_map) =
|
||||
serde_json::from_slice::<IndexMap<&str, &RawValue>>(&body_bytes)
|
||||
{
|
||||
let message = json_map.get("message");
|
||||
if let Some(message) = message {
|
||||
let msg: String = match serde_json::from_str(message.get()) {
|
||||
Ok(msg) => msg,
|
||||
Err(_) => {
|
||||
"Unable to parse the response message from server".to_string()
|
||||
}
|
||||
};
|
||||
|
||||
error!("Error response from local_proxy: {status} {msg}");
|
||||
|
||||
json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys
|
||||
|
||||
let resp_json = serde_json::to_string(&json_map)
|
||||
.unwrap_or("failed to serialize the response message".to_string());
|
||||
|
||||
return json_response(status, resp_json);
|
||||
}
|
||||
}
|
||||
|
||||
error!("Unable to parse the response message from local_proxy");
|
||||
return json_response(
|
||||
status,
|
||||
json!({ "message": "Unable to parse the response message from server".to_string() }),
|
||||
);
|
||||
}
|
||||
r
|
||||
}
|
||||
Err(e @ SqlOverHttpError::Cancelled(_)) => {
|
||||
@@ -618,8 +664,6 @@ async fn handle_db_inner(
|
||||
|
||||
let authenticate_and_connect = Box::pin(
|
||||
async {
|
||||
let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_));
|
||||
|
||||
let keys = match auth {
|
||||
AuthData::Password(pw) => {
|
||||
backend
|
||||
@@ -634,7 +678,9 @@ async fn handle_db_inner(
|
||||
};
|
||||
|
||||
let client = match keys.keys {
|
||||
ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => {
|
||||
ComputeCredentialKeys::JwtPayload(payload)
|
||||
if backend.auth_backend.is_local_proxy() =>
|
||||
{
|
||||
let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?;
|
||||
let (cli_inner, _dsc) = client.client_inner();
|
||||
cli_inner.set_jwt_session(&payload).await?;
|
||||
|
||||
@@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio::sync::mpsc::error::SendError;
use tokio::task::JoinHandle;
use tokio::time::MissedTickBehavior;
use tracing::{info_span, Instrument};
use tracing::{error, info, info_span, Instrument};
use utils::critical;
use utils::lsn::Lsn;
use utils::postgres_client::Compression;
use utils::postgres_client::InterpretedFormat;
|
||||
@@ -120,6 +121,20 @@ pub enum InterpretedWalReaderError {
|
||||
WalStreamClosed,
|
||||
}
|
||||
|
||||
enum CurrentPositionUpdate {
|
||||
Reset(Lsn),
|
||||
NotReset(Lsn),
|
||||
}
|
||||
|
||||
impl CurrentPositionUpdate {
|
||||
fn current_position(&self) -> Lsn {
|
||||
match self {
|
||||
CurrentPositionUpdate::Reset(lsn) => *lsn,
|
||||
CurrentPositionUpdate::NotReset(lsn) => *lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl InterpretedWalReaderState {
|
||||
fn current_position(&self) -> Option<Lsn> {
|
||||
match self {
|
||||
@@ -129,6 +144,26 @@ impl InterpretedWalReaderState {
|
||||
InterpretedWalReaderState::Done => None,
|
||||
}
|
||||
}
|
||||
|
||||
// Reset the current position of the WAL reader if the requested starting position
|
||||
// of the new shard is smaller than the current value.
|
||||
fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate {
|
||||
match self {
|
||||
InterpretedWalReaderState::Running {
|
||||
current_position, ..
|
||||
} => {
|
||||
if new_shard_start_pos < *current_position {
|
||||
*current_position = new_shard_start_pos;
|
||||
CurrentPositionUpdate::Reset(*current_position)
|
||||
} else {
|
||||
CurrentPositionUpdate::NotReset(*current_position)
|
||||
}
|
||||
}
|
||||
InterpretedWalReaderState::Done => {
|
||||
panic!("maybe_reset called on finished reader")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
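// Illustrative example (not part of this diff) of the `maybe_reset` contract,
// with made-up LSNs and the other `Running` fields elided. Assuming the reader
// is currently at 0/200:
//
//   state.maybe_reset(Lsn::from_str("0/100")?) // => Reset(0/100): rewind the WAL stream and decoder
//   state.maybe_reset(Lsn::from_str("0/300")?) // => NotReset(0/200): the reader will get there anyway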
|
||||
|
||||
pub(crate) struct AttachShardNotification {
|
||||
@@ -179,11 +214,10 @@ impl InterpretedWalReader {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = reader.run_impl(start_pos).await;
|
||||
if let Err(ref err) = res {
|
||||
tracing::error!("Task finished with error: {err}");
|
||||
}
|
||||
res
|
||||
reader
|
||||
.run_impl(start_pos)
|
||||
.await
|
||||
.inspect_err(|err| critical!("failed to read WAL record: {err:?}"))
|
||||
}
|
||||
.instrument(info_span!("interpreted wal reader")),
|
||||
);
|
||||
@@ -239,11 +273,10 @@ impl InterpretedWalReader {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = self.run_impl(start_pos).await;
|
||||
if let Err(err) = res {
|
||||
tracing::error!("Interpreted wal reader encountered error: {err}");
|
||||
if let Err(err) = self.run_impl(start_pos).await {
|
||||
critical!("failed to read WAL record: {err:?}");
|
||||
} else {
|
||||
tracing::info!("Interpreted wal reader exiting");
|
||||
info!("interpreted wal reader exiting");
|
||||
}
|
||||
|
||||
Err(CopyStreamHandlerEnd::Other(anyhow!(
|
||||
@@ -410,15 +443,24 @@ impl InterpretedWalReader {
|
||||
};
|
||||
|
||||
senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
|
||||
let current_pos = self.state.read().unwrap().current_position().unwrap();
|
||||
if start_pos < current_pos {
|
||||
self.wal_stream.reset(start_pos).await;
|
||||
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
|
||||
}
|
||||
|
||||
// If the shard is subscribing below the current position then we need
// to update the cursor that tracks where we are in the WAL
// ([`Self::state`]) and reset the WAL stream itself
// ([`Self::wal_stream`]). This must be done atomically from the POV of
// anything outside the select statement.
|
||||
let position_reset = self.state.write().unwrap().maybe_reset(start_pos);
|
||||
match position_reset {
|
||||
CurrentPositionUpdate::Reset(to) => {
|
||||
self.wal_stream.reset(to).await;
|
||||
wal_decoder = WalStreamDecoder::new(to, self.pg_version);
|
||||
},
|
||||
CurrentPositionUpdate::NotReset(_) => {}
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"Added shard sender {} with start_pos={} current_pos={}",
|
||||
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
|
||||
ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -584,7 +626,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
@@ -715,7 +757,6 @@ mod tests {
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
const ATTACHED_SHARDS: u8 = 4;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
@@ -725,9 +766,11 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut next_record_lsns = Vec::default();
|
||||
let end_watch =
|
||||
Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
@@ -746,38 +789,71 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
let mut batch_receivers = vec![rx];
|
||||
struct Sender {
|
||||
tx: Option<tokio::sync::mpsc::Sender<Batch>>,
|
||||
rx: tokio::sync::mpsc::Receiver<Batch>,
|
||||
shard: ShardIdentity,
|
||||
start_lsn: Lsn,
|
||||
received_next_record_lsns: Vec<Lsn>,
|
||||
}
|
||||
|
||||
impl Sender {
|
||||
fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
Self {
|
||||
tx: Some(tx),
|
||||
rx,
|
||||
shard,
|
||||
start_lsn,
|
||||
received_next_record_lsns: Vec::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert!(next_record_lsns.len() > 7);
|
||||
let start_lsns = vec![
|
||||
next_record_lsns[5],
|
||||
next_record_lsns[1],
|
||||
next_record_lsns[3],
|
||||
];
|
||||
let mut senders = start_lsns
|
||||
.into_iter()
|
||||
.map(|lsn| Sender::new(lsn, shard_0))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let first_sender = senders.first_mut().unwrap();
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
tx,
|
||||
shard_0,
|
||||
first_sender.start_lsn,
|
||||
first_sender.tx.take().unwrap(),
|
||||
first_sender.shard,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
for _ in 0..(ATTACHED_SHARDS - 1) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
handle.fanout(shard_0, tx, start_lsn).unwrap();
|
||||
batch_receivers.push(rx);
|
||||
for sender in senders.iter_mut().skip(1) {
|
||||
handle
|
||||
.fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
loop {
|
||||
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
|
||||
for rx in batch_receivers.iter_mut().skip(1) {
|
||||
let other_batch = rx.recv().await.unwrap();
|
||||
|
||||
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
|
||||
assert_eq!(
|
||||
batch.available_wal_end_lsn,
|
||||
other_batch.available_wal_end_lsn
|
||||
for sender in senders.iter_mut() {
|
||||
loop {
|
||||
let batch = sender.rx.recv().await.unwrap();
|
||||
tracing::info!(
|
||||
"Sender with start_lsn={} received batch ending at {} with {} records",
|
||||
sender.start_lsn,
|
||||
batch.wal_end_lsn,
|
||||
batch.records.records.len()
|
||||
);
|
||||
}
|
||||
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
for rec in batch.records.records {
|
||||
sender.received_next_record_lsns.push(rec.next_record_lsn);
|
||||
}
|
||||
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -792,5 +868,20 @@ mod tests {
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
|
||||
for sender in senders {
|
||||
tracing::info!(
|
||||
"Validating records received by sender with start_lsn={}",
|
||||
sender.start_lsn
|
||||
);
|
||||
|
||||
assert!(sender.received_next_record_lsns.is_sorted());
|
||||
let expected = next_record_lsns
|
||||
.iter()
|
||||
.filter(|lsn| **lsn > sender.start_lsn)
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(sender.received_next_record_lsns, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,6 +122,7 @@ impl Env {
|
||||
start_lsn: Lsn,
|
||||
msg_size: usize,
|
||||
msg_count: usize,
|
||||
mut next_record_lsns: Option<&mut Vec<Lsn>>,
|
||||
) -> anyhow::Result<EndWatch> {
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
|
||||
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
|
||||
@@ -130,7 +131,7 @@ impl Env {
|
||||
|
||||
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
|
||||
|
||||
let prefix = c"p";
|
||||
let prefix = c"neon-file:";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(msg_size >= prefixlen);
|
||||
let message = vec![0; msg_size - prefixlen];
|
||||
@@ -139,6 +140,9 @@ impl Env {
|
||||
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
|
||||
for _ in 0..msg_count {
|
||||
let (lsn, record) = walgen.next().unwrap();
|
||||
if let Some(ref mut lsns) = next_record_lsns {
|
||||
lsns.push(lsn);
|
||||
}
|
||||
|
||||
let req = AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
|
||||
@@ -592,6 +592,8 @@ impl Timeline {
|
||||
assert!(self.cancel.is_cancelled());
|
||||
assert!(self.gate.close_complete());
|
||||
|
||||
info!("deleting timeline {} from disk", self.ttid);
|
||||
|
||||
// Close associated FDs. Nobody will be able to touch timeline data once
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.close_wal_store();
|
||||
|
||||
@@ -475,6 +475,8 @@ impl GlobalTimelines {
|
||||
info!("deleting timeline {}, only_local={}", ttid, only_local);
|
||||
timeline.shutdown().await;
|
||||
|
||||
info!("timeline {ttid} shut down for deletion");
|
||||
|
||||
// Take a lock and finish the deletion holding this mutex.
|
||||
let mut shared_state = timeline.write_shared_state().await;
|
||||
|
||||
|
||||
@@ -246,7 +246,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
@@ -32,6 +32,7 @@ CREATE TABLE IF NOT EXISTS results (
|
||||
flaky BOOLEAN NOT NULL,
|
||||
arch arch DEFAULT 'X64',
|
||||
lfc BOOLEAN DEFAULT false NOT NULL,
|
||||
sanitizers BOOLEAN DEFAULT false NOT NULL,
|
||||
build_type TEXT NOT NULL,
|
||||
pg_version INT NOT NULL,
|
||||
run_id BIGINT NOT NULL,
|
||||
@@ -39,7 +40,7 @@ CREATE TABLE IF NOT EXISTS results (
|
||||
reference TEXT NOT NULL,
|
||||
revision CHAR(40) NOT NULL,
|
||||
raw JSONB COMPRESSION lz4 NOT NULL,
|
||||
UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
|
||||
UNIQUE (parent_suite, suite, name, arch, lfc, sanitizers, build_type, pg_version, started_at, stopped_at, run_id)
|
||||
);
|
||||
"""
|
||||
|
||||
@@ -56,6 +57,7 @@ class Row:
|
||||
flaky: bool
|
||||
arch: str
|
||||
lfc: bool
|
||||
sanitizers: bool
|
||||
build_type: str
|
||||
pg_version: int
|
||||
run_id: int
|
||||
@@ -135,6 +137,7 @@ def ingest_test_result(
|
||||
}
|
||||
arch = parameters.get("arch", "UNKNOWN").strip("'")
|
||||
lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc"
|
||||
sanitizers = parameters.get("sanitizers", "disabled").strip("'") == "enabled"
|
||||
|
||||
build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
|
||||
labels = {label["name"]: label["value"] for label in test["labels"]}
|
||||
@@ -149,6 +152,7 @@ def ingest_test_result(
|
||||
flaky=test["flaky"] or test["retriesStatusChange"],
|
||||
arch=arch,
|
||||
lfc=lfc,
|
||||
sanitizers=sanitizers,
|
||||
build_type=build_type,
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
|
||||
@@ -225,7 +225,7 @@ pub(crate) enum NotifyError {
|
||||
// We shutdown while sending
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
// A response indicates we will never succeed, such as 400 or 404
|
||||
// A response indicates we will never succeed, such as 400 or 403
|
||||
#[error("Non-retryable error {0}")]
|
||||
Fatal(StatusCode),
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ use pageserver_api::shard::ShardConfigError;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use rustls::client::danger::ServerCertVerifier;
|
||||
use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
|
||||
use rustls::client::WebPkiServerVerifier;
|
||||
use rustls::crypto::ring;
|
||||
use scoped_futures::ScopedBoxFuture;
|
||||
@@ -194,6 +194,8 @@ impl Persistence {
|
||||
timeout: Duration,
|
||||
) -> Result<(), diesel::ConnectionError> {
|
||||
let started_at = Instant::now();
|
||||
log_postgres_connstr_info(database_url)
|
||||
.map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
|
||||
loop {
|
||||
match establish_connection_rustls(database_url).await {
|
||||
Ok(_) => {
|
||||
@@ -1281,6 +1283,51 @@ pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
|
||||
Ok(Arc::new(store))
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
/// A verifier that accepts all certificates (but still logs an error)
|
||||
struct AcceptAll(Arc<WebPkiServerVerifier>);
|
||||
impl ServerCertVerifier for AcceptAll {
|
||||
fn verify_server_cert(
|
||||
&self,
|
||||
end_entity: &rustls::pki_types::CertificateDer<'_>,
|
||||
intermediates: &[rustls::pki_types::CertificateDer<'_>],
|
||||
server_name: &rustls::pki_types::ServerName<'_>,
|
||||
ocsp_response: &[u8],
|
||||
now: rustls::pki_types::UnixTime,
|
||||
) -> Result<ServerCertVerified, rustls::Error> {
|
||||
let r =
|
||||
self.0
|
||||
.verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now);
|
||||
if let Err(err) = r {
|
||||
tracing::info!(
|
||||
?server_name,
|
||||
"ignoring db connection TLS validation error: {err:?}"
|
||||
);
|
||||
return Ok(ServerCertVerified::assertion());
|
||||
}
|
||||
r
|
||||
}
|
||||
fn verify_tls12_signature(
|
||||
&self,
|
||||
message: &[u8],
|
||||
cert: &rustls::pki_types::CertificateDer<'_>,
|
||||
dss: &rustls::DigitallySignedStruct,
|
||||
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
|
||||
self.0.verify_tls12_signature(message, cert, dss)
|
||||
}
|
||||
fn verify_tls13_signature(
|
||||
&self,
|
||||
message: &[u8],
|
||||
cert: &rustls::pki_types::CertificateDer<'_>,
|
||||
dss: &rustls::DigitallySignedStruct,
|
||||
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
|
||||
self.0.verify_tls13_signature(message, cert, dss)
|
||||
}
|
||||
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
|
||||
self.0.supported_verify_schemes()
|
||||
}
|
||||
}
|
||||
|
||||
/// Loads the root certificates and constructs a client config suitable for connecting.
|
||||
/// This function is blocking.
|
||||
fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
|
||||
@@ -1290,76 +1337,12 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
|
||||
.expect("ring should support the default protocol versions");
|
||||
static DO_CERT_CHECKS: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
|
||||
let do_cert_checks =
|
||||
DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok());
|
||||
DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok());
|
||||
Ok(if *do_cert_checks {
|
||||
client_config
|
||||
.with_root_certificates(load_certs()?)
|
||||
.with_no_client_auth()
|
||||
} else {
|
||||
use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified};
|
||||
#[derive(Debug)]
|
||||
struct AcceptAll(Arc<WebPkiServerVerifier>);
|
||||
impl ServerCertVerifier for AcceptAll {
|
||||
fn verify_server_cert(
|
||||
&self,
|
||||
end_entity: &rustls::pki_types::CertificateDer<'_>,
|
||||
intermediates: &[rustls::pki_types::CertificateDer<'_>],
|
||||
server_name: &rustls::pki_types::ServerName<'_>,
|
||||
ocsp_response: &[u8],
|
||||
now: rustls::pki_types::UnixTime,
|
||||
) -> Result<ServerCertVerified, rustls::Error> {
|
||||
let r = self.0.verify_server_cert(
|
||||
end_entity,
|
||||
intermediates,
|
||||
server_name,
|
||||
ocsp_response,
|
||||
now,
|
||||
);
|
||||
if let Err(err) = r {
|
||||
tracing::info!(
|
||||
?server_name,
|
||||
"ignoring db connection TLS validation error: {err:?}"
|
||||
);
|
||||
return Ok(ServerCertVerified::assertion());
|
||||
}
|
||||
r
|
||||
}
|
||||
fn verify_tls12_signature(
|
||||
&self,
|
||||
message: &[u8],
|
||||
cert: &rustls::pki_types::CertificateDer<'_>,
|
||||
dss: &rustls::DigitallySignedStruct,
|
||||
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error>
|
||||
{
|
||||
let r = self.0.verify_tls12_signature(message, cert, dss);
|
||||
if let Err(err) = r {
|
||||
tracing::info!(
|
||||
"ignoring db connection 1.2 signature TLS validation error: {err:?}"
|
||||
);
|
||||
return Ok(HandshakeSignatureValid::assertion());
|
||||
}
|
||||
r
|
||||
}
|
||||
fn verify_tls13_signature(
|
||||
&self,
|
||||
message: &[u8],
|
||||
cert: &rustls::pki_types::CertificateDer<'_>,
|
||||
dss: &rustls::DigitallySignedStruct,
|
||||
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error>
|
||||
{
|
||||
let r = self.0.verify_tls13_signature(message, cert, dss);
|
||||
if let Err(err) = r {
|
||||
tracing::info!(
|
||||
"ignoring db connection 1.3 signature TLS validation error: {err:?}"
|
||||
);
|
||||
return Ok(HandshakeSignatureValid::assertion());
|
||||
}
|
||||
r
|
||||
}
|
||||
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
|
||||
self.0.supported_verify_schemes()
|
||||
}
|
||||
}
|
||||
let verifier = AcceptAll(
|
||||
WebPkiServerVerifier::builder_with_provider(
|
||||
load_certs()?,
|
||||
@@ -1389,6 +1372,29 @@ fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<Async
|
||||
fut.boxed()
|
||||
}
|
||||
|
||||
#[cfg_attr(test, test)]
|
||||
fn test_config_debug_censors_password() {
|
||||
let has_pw =
|
||||
"host=/var/lib/postgresql,localhost port=1234 user=specialuser password='NOT ALLOWED TAG'";
|
||||
let has_pw_cfg = has_pw.parse::<tokio_postgres::Config>().unwrap();
|
||||
assert!(format!("{has_pw_cfg:?}").contains("specialuser"));
|
||||
// Ensure that the password is not leaked by the debug impl
|
||||
assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG"));
|
||||
}
|
||||
|
||||
fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> {
|
||||
let config = config_str
|
||||
.parse::<tokio_postgres::Config>()
|
||||
.map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?;
|
||||
// We use debug formatting here, and use a unit test to ensure that we don't leak the password.
|
||||
// To make extra sure the test gets run, run it every time the function is called
|
||||
// (this is rather cold code, we can afford it).
|
||||
#[cfg(not(test))]
|
||||
test_config_debug_censors_password();
|
||||
tracing::info!("database connection config: {config:?}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
#[derive(
|
||||
QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,
|
||||
|
||||
@@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self {
|
||||
Self {
|
||||
config: ReconcilerConfig {
|
||||
tenant_creation_hint: hint,
|
||||
..self.config
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> ReconcilerConfig {
|
||||
self.config
|
||||
}
|
||||
@@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig {
|
||||
// During live migrations this is the amount of time that
|
||||
// the pageserver will hold our poll.
|
||||
secondary_download_request_timeout: Option<Duration>,
|
||||
|
||||
// A hint indicating whether this reconciliation is done on the
|
||||
// creation of a new tenant. This only informs logging behaviour.
|
||||
tenant_creation_hint: bool,
|
||||
}
|
||||
|
||||
impl ReconcilerConfig {
|
||||
@@ -143,6 +156,10 @@ impl ReconcilerConfig {
|
||||
self.secondary_download_request_timeout
|
||||
.unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_creation_hint(&self) -> bool {
|
||||
self.tenant_creation_hint
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
|
||||
@@ -934,16 +951,35 @@ impl Reconciler {
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = &result {
|
||||
// It is up to the caller whether they want to drop out on this error, but they don't have to:
|
||||
// in general we should avoid letting unavailability of the cloud control plane stop us from
|
||||
// making progress.
|
||||
if !matches!(e, NotifyError::ShuttingDown) {
|
||||
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
|
||||
}
|
||||
|
||||
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
|
||||
// needs to retry at some point.
|
||||
self.compute_notify_failure = true;
|
||||
|
||||
// It is up to the caller whether they want to drop out on this error, but they don't have to:
|
||||
// in general we should avoid letting unavailability of the cloud control plane stop us from
|
||||
// making progress.
|
||||
match e {
|
||||
// 404s from cplane during tenant creation are expected.
|
||||
// Cplane only persists the shards to the database after
|
||||
// creating the tenant and the timeline. If we notify before
|
||||
// that, we'll get a 404.
|
||||
//
|
||||
// This is fine because tenant creations happen via /location_config
|
||||
// and that returns the list of locations in the response. Hence, we
|
||||
// silence the error and return Ok(()) here. Reconciliation will still
|
||||
// be retried because we set [`Reconciler::compute_notify_failure`] above.
|
||||
NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND)
|
||||
if self.reconciler_config.tenant_creation_hint() =>
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
NotifyError::ShuttingDown => {}
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Failed to notify compute of attached pageserver {node}: {e}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
} else {
|
||||
|
||||
@@ -2238,9 +2238,14 @@ impl Service {
|
||||
let waiters = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
let config = ReconcilerConfigBuilder::new()
|
||||
.tenant_creation_hint(true)
|
||||
.build();
|
||||
tenants
|
||||
.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
.filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
|
||||
.filter_map(|(_shard_id, shard)| {
|
||||
self.maybe_configured_reconcile_shard(shard, nodes, config)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
|
||||
@@ -707,6 +707,7 @@ impl TenantShard {
|
||||
if let Some(node_id) = self.intent.get_attached() {
|
||||
// Populate secondary by demoting the attached node
|
||||
self.intent.demote_attached(scheduler, *node_id);
|
||||
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
@@ -979,24 +980,51 @@ impl TenantShard {
|
||||
),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
if secondary_scores.iter().any(|score| score.1.is_none()) {
|
||||
// Don't have full list of scores, so can't make a good decision about which to drop unless
|
||||
// there is an obvious one in the wrong AZ
|
||||
for secondary in self.intent.get_secondary() {
|
||||
if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
|
||||
// Trivial case: if we only have one secondary, drop that one
|
||||
if self.intent.get_secondary().len() == 1 {
|
||||
return Some(ScheduleOptimization {
|
||||
sequence: self.sequence,
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(
|
||||
*self.intent.get_secondary().first().unwrap(),
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
// Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state
|
||||
// where its score can't be calculated), and drop the others. This enables us to make progress in
|
||||
// most cases, even if some nodes are offline or have scheduling=pause set.
|
||||
|
||||
debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this
|
||||
// logic presumes we are in a mode where we want secondaries to be in non-home AZ
|
||||
if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| {
|
||||
let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id;
|
||||
let is_available = secondary_scores
|
||||
.get(n)
|
||||
.expect("Built from same list of nodes")
|
||||
.is_some();
|
||||
is_available && !in_home_az
|
||||
}) {
|
||||
// Great, we found one to retain. Pick some other to drop.
|
||||
if let Some(victim) = self
|
||||
.intent
|
||||
.get_secondary()
|
||||
.iter()
|
||||
.find(|n| n != &retain_secondary)
|
||||
{
|
||||
return Some(ScheduleOptimization {
|
||||
sequence: self.sequence,
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(*secondary),
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(*victim),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through: we didn't identify one to remove. This ought to be rare.
|
||||
tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
} else {
|
||||
let victim = secondary_scores
|
||||
.iter()
|
||||
@@ -1005,7 +1033,7 @@ impl TenantShard {
|
||||
.0;
|
||||
return Some(ScheduleOptimization {
|
||||
sequence: self.sequence,
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(victim),
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(*victim),
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2379,6 +2407,110 @@ pub(crate) mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test how the optimisation code behaves with an extra secondary
|
||||
#[test]
|
||||
fn optimize_removes_secondary() -> anyhow::Result<()> {
|
||||
let az_a_tag = AvailabilityZone("az-a".to_string());
|
||||
let az_b_tag = AvailabilityZone("az-b".to_string());
|
||||
let mut nodes = make_test_nodes(
|
||||
4,
|
||||
&[
|
||||
az_a_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
az_a_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
],
|
||||
);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
shard_a.intent.preferred_az_id = Some(az_a_tag.clone());
|
||||
shard_a
|
||||
.schedule(&mut scheduler, &mut schedule_context)
|
||||
.unwrap();
|
||||
|
||||
// Attached on node 1, secondary on node 2
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]);
|
||||
|
||||
// Initially optimiser is idle
|
||||
assert_eq!(
|
||||
shard_a.optimize_attachment(&mut scheduler, &schedule_context),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
shard_a.optimize_secondary(&mut scheduler, &schedule_context),
|
||||
None
|
||||
);
|
||||
|
||||
// A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over
|
||||
// to our new location
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
|
||||
assert_eq!(
|
||||
optimization,
|
||||
Some(ScheduleOptimization {
|
||||
sequence: shard_a.sequence,
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3))
|
||||
})
|
||||
);
|
||||
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
|
||||
|
||||
// A spare secondary in the non-home AZ, and one of them is offline
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(4));
|
||||
nodes
|
||||
.get_mut(&NodeId(4))
|
||||
.unwrap()
|
||||
.set_availability(NodeAvailability::Offline);
|
||||
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
|
||||
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
|
||||
assert_eq!(
|
||||
optimization,
|
||||
Some(ScheduleOptimization {
|
||||
sequence: shard_a.sequence,
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4))
|
||||
})
|
||||
);
|
||||
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
|
||||
|
||||
// A spare secondary when should have none
|
||||
shard_a.policy = PlacementPolicy::Attached(0);
|
||||
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
|
||||
assert_eq!(
|
||||
optimization,
|
||||
Some(ScheduleOptimization {
|
||||
sequence: shard_a.sequence,
|
||||
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2))
|
||||
})
|
||||
);
|
||||
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![]);
|
||||
|
||||
// Check that in secondary mode, we preserve the secondary in the preferred AZ
|
||||
let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule()
|
||||
shard_a.policy = PlacementPolicy::Secondary;
|
||||
shard_a
|
||||
.schedule(&mut scheduler, &mut schedule_context)
|
||||
.unwrap();
|
||||
assert_eq!(shard_a.intent.get_attached(), &None);
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
assert_eq!(
|
||||
shard_a.optimize_attachment(&mut scheduler, &schedule_context),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
shard_a.optimize_secondary(&mut scheduler, &schedule_context),
|
||||
None
|
||||
);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Optimize til quiescent: this emulates what Service::optimize_all does, when
|
||||
// called repeatedly in the background.
|
||||
// Returns the applied optimizations
|
||||
|
||||
@@ -2766,6 +2766,11 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}")
|
||||
raise
|
||||
|
||||
def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any:
|
||||
path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json"
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
def tenant_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
@@ -3340,7 +3345,7 @@ class NeonProxy(PgProtocol):
|
||||
metric_collection_interval: str | None = None,
|
||||
):
|
||||
host = "127.0.0.1"
|
||||
domain = "proxy.localtest.me" # resolves to 127.0.0.1
|
||||
domain = "proxy.local.neon.build" # resolves to 127.0.0.1
|
||||
super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port)
|
||||
|
||||
self.domain = domain
|
||||
@@ -3363,7 +3368,7 @@ class NeonProxy(PgProtocol):
|
||||
# generate key if it doesn't exist
|
||||
crt_path = self.test_output_dir / "proxy.crt"
|
||||
key_path = self.test_output_dir / "proxy.key"
|
||||
generate_proxy_tls_certs("*.localtest.me", key_path, crt_path)
|
||||
generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path)
|
||||
|
||||
args = [
|
||||
str(self.neon_binpath / "proxy"),
|
||||
@@ -3564,7 +3569,7 @@ class NeonAuthBroker:
|
||||
external_http_port: int,
|
||||
auth_backend: NeonAuthBroker.ProxyV1,
|
||||
):
|
||||
self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1
|
||||
self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1
|
||||
self.host = "127.0.0.1"
|
||||
self.http_port = http_port
|
||||
self.external_http_port = external_http_port
|
||||
@@ -3581,7 +3586,7 @@ class NeonAuthBroker:
|
||||
# generate key if it doesn't exist
|
||||
crt_path = self.test_output_dir / "proxy.crt"
|
||||
key_path = self.test_output_dir / "proxy.key"
|
||||
generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path)
|
||||
generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path)
|
||||
|
||||
args = [
|
||||
str(self.neon_binpath / "proxy"),
|
||||
@@ -5117,12 +5122,14 @@ def wait_for_last_flush_lsn(
|
||||
timeline: TimelineId,
|
||||
pageserver_id: int | None = None,
|
||||
auth_token: str | None = None,
|
||||
last_flush_lsn: Lsn | None = None,
|
||||
) -> Lsn:
|
||||
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
|
||||
|
||||
shards = tenant_get_shards(env, tenant, pageserver_id)
|
||||
|
||||
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
if last_flush_lsn is None:
|
||||
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
|
||||
results = []
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
|
||||
@@ -124,5 +124,8 @@ def pytest_runtest_makereport(*args, **kwargs):
allure.dynamic.parameter(
"__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc"
)
allure.dynamic.parameter(
"__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled"
)

yield
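This conftest hunk tags every test report with environment-driven Allure parameters (LFC state and sanitizer state), so runs with different build flavours show up as distinct variants. A stripped-down hookwrapper doing the same kind of tagging might look like this (hypothetical standalone conftest, not the project's actual one):

import os

import allure
import pytest

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    # Record build flavour as Allure parameters so differently-configured runs
    # are reported as separate test variants.
    allure.dynamic.parameter(
        "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc"
    )
    allure.dynamic.parameter(
        "__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled"
    )
    yield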
@@ -282,18 +282,35 @@ class S3Storage:
def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"

def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str:
"""
Gets the latest generation key from a list of keys.

@param index_keys: A list of keys of different generations, which start with `prefix`
"""

def parse_gen(key: str) -> int:
shortname = key.split("/")[-1]
generation_str = shortname.removeprefix(prefix).removesuffix(suffix)
try:
return int(generation_str, base=16)
except ValueError:
log.info(f"Ignoring non-matching key: {key}")
return -1

if len(keys) == 0:
raise IndexError("No keys found")

return max(keys, key=parse_gen)

def get_latest_index_key(self, index_keys: list[str]) -> str:
"""
Gets the latest index file key.

@param index_keys: A list of index keys of different generations.
"""

def parse_gen(index_key: str) -> int:
parts = index_key.split("index_part.json-")
return int(parts[-1], base=16) if len(parts) == 2 else -1

return max(index_keys, key=parse_gen)
key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys)
return key

def download_index_part(self, index_key: str) -> IndexPartDump:
"""
@@ -306,6 +323,29 @@ class S3Storage:
log.info(f"index_part.json: {body}")
return IndexPartDump.from_json(json.loads(body))

def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None:
tenant_prefix = self.tenant_path(tenant_id)

objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[
"Contents"
]
keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1]
try:
manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys)
except IndexError:
log.info(
f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet"
)
return None

response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key)
body = response["Body"].read().decode("utf-8")
log.info(f"Downloaded manifest {manifest_key}: {body}")

manifest = json.loads(body)
assert isinstance(manifest, dict)
return manifest

def heatmap_key(self, tenant_id: TenantId) -> str:
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
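The new `get_latest_generation_key` helper generalizes the old index-specific logic: it strips a known prefix and suffix from each key's basename, parses the remaining hex generation number, and returns the key with the highest one. A standalone sketch of the same idea on made-up object names (not the fixture class itself):

def latest_generation_key(keys: list[str], prefix: str, suffix: str = "") -> str:
    """Return the key whose '<prefix><hex-generation><suffix>' basename has the highest generation."""
    def generation(key: str) -> int:
        basename = key.split("/")[-1]
        gen_hex = basename.removeprefix(prefix).removesuffix(suffix)
        try:
            return int(gen_hex, base=16)
        except ValueError:
            return -1  # keys that don't match the pattern sort last
    if not keys:
        raise IndexError("No keys found")
    return max(keys, key=generation)

# Example with hypothetical key names:
keys = [
    "tenants/t1/timelines/tl1/index_part.json-000000a1",
    "tenants/t1/timelines/tl1/index_part.json-000000a2",
]
assert latest_generation_key(keys, prefix="index_part.json-").endswith("-000000a2")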
@@ -76,6 +76,9 @@ def test_ingest_logical_message(
log.info("Waiting for Pageserver to catch up")
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)

recover_to_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
endpoint.stop()

# Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
# reingest all the WAL from the safekeeper without any other constraints. This gives us a
# baseline of how fast the pageserver can ingest this WAL in isolation.
@@ -88,7 +91,13 @@ def test_ingest_logical_message(
with zenbenchmark.record_duration("pageserver_recover_ingest"):
log.info("Recovering WAL into pageserver")
client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
wait_for_last_flush_lsn(
env, endpoint, env.initial_tenant, env.initial_timeline, last_flush_lsn=recover_to_lsn
)

# Check endpoint can start, i.e. we really recovered
endpoint.start()
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)

# Emit metrics.
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
@@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
cur.execute("set log_statement = 'all'")
cur.execute("create table t(x integer)")
for _ in range(n_iters):
cur.execute(f"insert into t values (generate_series(1,{n_records}))")
with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"):
cur.execute(f"insert into t values (generate_series(1,{n_records}))")
time.sleep(1)

cur.execute("vacuum t")
with zenbenchmark.record_duration("vacuum t"):
cur.execute("vacuum t")

with zenbenchmark.record_duration("test_query"):
with zenbenchmark.record_duration("SELECT count(*) from t"):
cur.execute("SELECT count(*) from t")
assert cur.fetchone() == (n_iters * n_records,)

flush_ep_to_pageserver(env, endpoint, tenant, timeline)
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)
with zenbenchmark.record_duration("flush_ep_to_pageserver"):
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
with zenbenchmark.record_duration("timeline_checkpoint"):
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)
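The benchmark now wraps each statement in `zenbenchmark.record_duration(...)` so every phase is reported under its own name instead of being lumped into one timing. A minimal stand-in for such a duration recorder, assuming only that the fixture exposes a context manager (the real one records into the benchmark report rather than a dict):

import time
from contextlib import contextmanager

@contextmanager
def record_duration(name: str, results: dict[str, float]):
    # Measure wall-clock time of the enclosed block and file it under `name`.
    start = time.perf_counter()
    try:
        yield
    finally:
        results[name] = time.perf_counter() - start

timings: dict[str, float] = {}
with record_duration("vacuum t", timings):
    pass  # run the statement here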
@@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path):
"LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
"PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
"PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
"PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
"PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
}
# Combine the current environment with custom variables
env = os.environ.copy()
@@ -29,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
# "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}

PREEMPT_COMPACTION_TENANT_CONF = {
"gc_period": "5s",
"compaction_period": "5s",
# Small checkpoint distance to create many layers
"checkpoint_distance": 1024**2,
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 1,
"image_creation_preempt_threshold": 1,
# compact more frequently
"compaction_threshold": 3,
"compaction_upper_limit": 6,
"lsn_lease_length": "0s",
}


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
@@ -36,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke(
page_cache_size=10
"""

env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)

tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -113,6 +130,41 @@ page_cache_size=10
assert vectored_average < 8


@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_preempt(
neon_env_builder: NeonEnvBuilder,
):
# Ideally we should be able to do unit tests for this, but we need real Postgres
# WALs in order to do unit testing...

conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)

tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

row_count = 200000
churn_rounds = 10

ps_http = env.pageserver.http_client()

workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageserver.id)

log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)

for i in range(1, churn_rounds + 1):
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id, upload=False)
workload.validate(env.pageserver.id)
ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
# ensure image layer creation gets preempted and then resumed
env.pageserver.assert_log_contains("resuming image layer creation")


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"with_branches",
@@ -250,6 +302,9 @@ def test_pageserver_gc_compaction_idempotent(
workload.churn_rows(row_count, env.pageserver.id)
# compact 3 times if mode is before_restart
n_compactions = 3 if compaction_mode == "before_restart" else 1
ps_http.timeline_compact(
tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True
)
for _ in range(n_compactions):
# Force refresh gc info to have gc_cutoff generated
ps_http.timeline_gc(tenant_id, timeline_id, None)
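test_pageserver_compaction_preempt relies on `assert_log_contains` to prove that image layer creation really was preempted and then resumed under the aggressive `image_creation_preempt_threshold` setting. The check amounts to scanning the pageserver log for a marker line; a simplified version of that idea (hypothetical log path, not the fixture implementation):

import re
from pathlib import Path

def log_contains(log_path: Path, pattern: str) -> bool:
    """Return True if any line of the log matches the regex pattern."""
    regex = re.compile(pattern)
    with open(log_path, errors="replace") as f:
        return any(regex.search(line) for line in f)

# e.g. log_contains(Path("pageserver.log"), "resuming image layer creation")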
@@ -314,7 +314,10 @@ def test_forward_compatibility(


def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
ep = env.endpoints.create_start("main")
ep = env.endpoints.create("main")
ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")}
ep.start(env=ep_env)

connstr = ep.connstr()

pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
@@ -363,7 +366,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
)

# Timeline exists again: restart the endpoint
ep.start()
ep.start(env=ep_env)

pg_bin.run_capture(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
@@ -471,6 +474,14 @@ HISTORIC_DATA_SETS = [
PgVersion.V16,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst",
),
# This dataset created on a pageserver running modern code at time of capture, but configured with no generation. This
# is our regression test that we can load data written without generations in layer file names & indices
HistoricDataSet(
"2025-02-07-nogenerations",
TenantId("e1411ca6562d6ff62419f693a5695d67"),
PgVersion.V17,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst",
),
]
@@ -95,6 +95,8 @@ def test_remote_extensions(

# mock remote_extensions spec
spec: dict[str, Any] = {
"public_extensions": ["anon"],
"custom_extensions": None,
"library_index": {
"anon": "anon",
},
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548"
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"attach_mode",
|
||||
["default_generation", "same_generation"],
|
||||
@@ -172,7 +175,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
|
||||
# force removal of layers from the future
|
||||
tenant_conf = ps_http.tenant_config(tenant_id)
|
||||
generation_before_detach = get_generation_number()
|
||||
env.pageserver.tenant_detach(tenant_id)
|
||||
env.pageserver.http_client().tenant_detach(tenant_id)
|
||||
failpoint_deletion_queue = "deletion-queue-before-execute-pause"
|
||||
|
||||
ps_http.configure_failpoints((failpoint_deletion_queue, "pause"))
|
||||
|
||||
@@ -12,7 +12,6 @@ of the pageserver are:
from __future__ import annotations

import os
import re
import time
from enum import StrEnum

@@ -29,7 +28,6 @@ from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_tenant_state,
list_prefix,
wait_for_last_record_lsn,
wait_for_upload,
)
@@ -124,109 +122,6 @@ def assert_deletion_queue(ps_http, size_fn) -> None:
assert size_fn(v) is True


def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
"""
Validate behavior when a pageserver is run without generation support enabled,
then started again after activating it:
- Before upgrade, no objects should have generation suffixes
- After upgrade, the bucket should contain a mixture.
- In both cases, postgres I/O should work.
"""
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)

env = neon_env_builder.init_configs()
env.broker.start()
for sk in env.safekeepers:
sk.start()
env.storage_controller.start()

# We will start a pageserver with no control_plane_api set, so it won't be able to self-register
env.storage_controller.node_register(env.pageserver)

def remove_control_plane_api_field(config):
return config.pop("control_plane_api")

control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field)
env.pageserver.start()
env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})

env.create_tenant(
tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
)

generate_uploads_and_deletions(env, pageserver=env.pageserver)

def parse_generation_suffix(key):
m = re.match(".+-([0-9a-zA-Z]{8})$", key)
if m is None:
return None
else:
log.info(f"match: {m}")
log.info(f"group: {m.group(1)}")
return int(m.group(1), 16)

assert neon_env_builder.pageserver_remote_storage is not None
pre_upgrade_keys = list(
[
o["Key"]
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
"Contents"
]
]
)
for key in pre_upgrade_keys:
assert parse_generation_suffix(key) is None

env.pageserver.stop()
# Starting without the override that disabled control_plane_api
env.pageserver.patch_config_toml_nonrecursive(
{
"control_plane_api": control_plane_api,
}
)
env.pageserver.start()

generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)

legacy_objects: list[str] = []
suffixed_objects = []
post_upgrade_keys = list(
[
o["Key"]
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
"Contents"
]
]
)
for key in post_upgrade_keys:
log.info(f"post-upgrade key: {key}")
if parse_generation_suffix(key) is not None:
suffixed_objects.append(key)
else:
legacy_objects.append(key)

# Bucket now contains a mixture of suffixed and non-suffixed objects
assert len(suffixed_objects) > 0
assert len(legacy_objects) > 0

# Flush through deletions to get a clean state for scrub: we are implicitly validating
# that our generations-enabled pageserver was able to do deletions of layers
# from earlier which don't have a generation.
env.pageserver.http_client().deletion_queue_flush(execute=True)

assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0

# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
healthy, metadata_summary = env.storage_scrubber.scan_metadata()
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1
assert healthy


def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
@@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
workload.write_rows(256, env.pageservers[0].id)
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)

def validate_heatmap(heatmap):
def validate_heatmap(heatmap, on_disk_heatmap):
assert len(heatmap["timelines"]) == 1
assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id)
assert len(heatmap["timelines"][0]["layers"]) > 0
@@ -452,10 +452,13 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
# Each layer appears at most once
assert len(set(layer["name"] for layer in layers)) == len(layers)

assert heatmap == on_disk_heatmap

# Download and inspect the heatmap that the pageserver uploaded
heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_first}")
validate_heatmap(heatmap_first)
validate_heatmap(heatmap_first, heatmap_first_on_disk)

# Do some more I/O to generate more layers
workload.churn_rows(64, env.pageservers[0].id)
@@ -463,9 +466,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):

# Ensure that another heatmap upload includes the new layers
heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_second}")
assert heatmap_second != heatmap_first
validate_heatmap(heatmap_second)
validate_heatmap(heatmap_second, heatmap_second_on_disk)


def list_elegible_layers(
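The heatmap test now cross-checks the heatmap uploaded to remote storage against the copy the pageserver keeps on local disk (`heatmap-v1.json`, read by the new `NeonPageserver.heatmap_content` helper). The comparison itself is plain structural JSON equality; roughly, under the assumption that both sides are ordinary JSON documents:

import json
from pathlib import Path

def read_local_heatmap(tenant_dir: Path) -> dict:
    # The pageserver persists its latest uploaded heatmap next to the tenant data.
    with open(tenant_dir / "heatmap-v1.json") as f:
        return json.load(f)

def heatmaps_match(uploaded: dict, on_disk: dict) -> bool:
    # Deep equality of the parsed JSON is what the test asserts.
    return uploaded == on_disk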
@@ -120,7 +120,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End

# Run the main PostgreSQL regression tests, in src/test/regress.
#
@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds
@pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pg_regress(
neon_env_builder: NeonEnvBuilder,
@@ -194,7 +194,7 @@ def test_pg_regress(

# Run the PostgreSQL "isolation" tests, in src/test/isolation.
#
@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds
@pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
def test_isolation(
neon_env_builder: NeonEnvBuilder,
@@ -222,6 +222,8 @@ def test_isolation(
"max_prepared_transactions=100",
# Enable the test mode, so that we don't need to patch the test cases.
"neon.regress_test_mode = true",
# Stack size should be increased for tests to pass with asan.
"max_stack_depth = 4MB",
],
)
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
@@ -417,7 +419,7 @@ def test_tx_abort_with_many_relations(
try:
# Rollback phase should be fast: this is one WAL record that we should process efficiently
fut = exec.submit(rollback_and_wait)
fut.result(timeout=5)
fut.result(timeout=15)
except:
exec.shutdown(wait=False, cancel_futures=True)
raise
@@ -57,7 +57,7 @@ def test_proxy_select_1(static_proxy: NeonProxy):
assert out[0][0] == 1

# with SNI
out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me")
out = static_proxy.safe_psql("select 42", host="generic-project-name.local.neon.build")
assert out[0][0] == 42


@@ -234,7 +234,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy):

connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
response = requests.post(
f"https://api.localtest.me:{static_proxy.external_http_port}/sql",
f"https://api.local.neon.build:{static_proxy.external_http_port}/sql",
data=json.dumps({"query": "select 42 as answer", "params": []}),
headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr},
verify=str(static_proxy.test_output_dir / "proxy.crt"),

@@ -35,7 +35,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil
check_cannot_connect(query="select 1", sslsni=0, options="endpoint=private-project")

# with SNI
check_cannot_connect(query="select 1", host="private-project.localtest.me")
check_cannot_connect(query="select 1", host="private-project.local.neon.build")

# no SNI, deprecated `options=project` syntax (before we had several endpoint in project)
out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project")
@@ -46,7 +46,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil
assert out[0][0] == 1

# with SNI
out = static_proxy.safe_psql(query="select 1", host="generic-project.localtest.me")
out = static_proxy.safe_psql(query="select 1", host="generic-project.local.neon.build")
assert out[0][0] == 1


@@ -116,7 +116,7 @@ def test_pg_sni_router(
test_output_dir: Path,
):
generate_tls_cert(
"endpoint.namespace.localtest.me",
"endpoint.namespace.local.neon.build",
test_output_dir / "router.crt",
test_output_dir / "router.key",
)
@@ -130,7 +130,7 @@ def test_pg_sni_router(
with PgSniRouter(
neon_binpath=neon_binpath,
port=router_port,
destination="localtest.me",
destination="local.neon.build",
tls_cert=test_output_dir / "router.crt",
tls_key=test_output_dir / "router.key",
test_output_dir=test_output_dir,
@@ -141,7 +141,7 @@ def test_pg_sni_router(
"select 1",
dbname="postgres",
sslmode="require",
host=f"endpoint--namespace--{pg_port}.localtest.me",
host=f"endpoint--namespace--{pg_port}.local.neon.build",
hostaddr="127.0.0.1",
)
assert out[0][0] == 1
@@ -3,12 +3,14 @@ from __future__ import annotations
import threading
import time

import pytest
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import wait_until


# This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates.
# It requires tracking information about replication origins at page server side
@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM
def test_subscriber_restart(neon_simple_env: NeonEnv):
env = neon_simple_env
env.create_branch("publisher")
@@ -554,8 +554,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
log.info(f"Timeline {state.timeline_id} is still active")
shutdown.wait(0.5)
elif state.timeline_id in offloaded_ids:
log.info(f"Timeline {state.timeline_id} is now offloaded")
state.offloaded = True
log.info(f"Timeline {state.timeline_id} is now offloaded in memory")

# Hack: when we see something offloaded in the API, it doesn't guarantee that the offload
# is persistent (it is marked offloaded first, then that is persisted to the tenant manifest).
# So we wait until we see the manifest update before considering it offloaded, that way
# subsequent checks that it doesn't revert to active on a restart will pass reliably.
time.sleep(0.1)
assert isinstance(env.pageserver_remote_storage, S3Storage)
manifest = env.pageserver_remote_storage.download_tenant_manifest(
tenant_id
)
if manifest is None:
log.info(
f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)"
)
elif str(state.timeline_id) in [
t["timeline_id"] for t in manifest["offloaded_timelines"]
]:
log.info(
f"Timeline {state.timeline_id} is now offloaded persistently"
)
state.offloaded = True
else:
log.info(
f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})"
)

break
else:
# Timeline is neither offloaded nor active, this is unexpected: the pageserver
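The archival-chaos change only counts a timeline as offloaded once the offload appears in the tenant manifest in remote storage, since the in-memory state can still be rolled back by a restart. That wait is essentially a poll over `download_tenant_manifest`; a condensed sketch under the assumption that the storage object exposes that method (helper name and timeout are illustrative):

import time

def wait_offloaded_persistently(storage, tenant_id, timeline_id, timeout: float = 30.0) -> None:
    """Poll the tenant manifest until the timeline is listed as offloaded."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        manifest = storage.download_tenant_manifest(tenant_id)  # None until the first manifest upload
        if manifest is not None and str(timeline_id) in [
            t["timeline_id"] for t in manifest["offloaded_timelines"]
        ]:
            return
        time.sleep(0.1)
    raise TimeoutError(f"timeline {timeline_id} never became persistently offloaded")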
@@ -13,12 +13,12 @@
# postgres -D data -p3000
#
# ## Launch proxy with WSS enabled:
# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me'
# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.local.neon.build'
# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres
#
# ## Launch the tunnel:
#
# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me"
# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.local.neon.build"
#
# ## Now you can connect with psql:
# psql "postgresql://heikki@localhost:40433/postgres"
vendor/postgres-v16 (vendored, 2 changes)
Submodule vendor/postgres-v16 updated: 3cf7ce1afa...13cf5d06c9
vendor/postgres-v17 (vendored, 2 changes)
Submodule vendor/postgres-v17 updated: f0ffc8279d...4c45d78ad5
vendor/revisions.json (vendored, 4 changes)
@@ -1,11 +1,11 @@
{
"v17": [
"17.2",
"f0ffc8279dbcbbc439981a4fd001a9687e5d665d"
"4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d"
],
"v16": [
"16.6",
"3cf7ce1afab75027716d14223f95ddb300754162"
"13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792"
],
"v15": [
"15.10",
@@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tracing-log = { version = "0.2" }
url = { version = "2", features = ["serde"] }
zerocopy = { version = "0.7", features = ["derive", "simd"] }
zeroize = { version = "1", features = ["derive", "serde"] }