mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 01:50:38 +00:00
Compare commits
1 Commits
vlad/get-v
...
problame/r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4707d2df6d |
@@ -59,7 +59,7 @@ runs:
|
||||
BUCKET: neon-github-public-dev
|
||||
|
||||
# TODO: We can replace with a special docker image with Java and Allure pre-installed
|
||||
- uses: actions/setup-java@v4
|
||||
- uses: actions/setup-java@v3
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: '17'
|
||||
@@ -76,8 +76,8 @@ runs:
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.27.0
|
||||
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
|
||||
ALLURE_VERSION: 2.24.0
|
||||
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
|
||||
|
||||
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
|
||||
- name: Acquire lock
|
||||
@@ -180,7 +180,7 @@ runs:
|
||||
fi
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -215,7 +215,7 @@ runs:
|
||||
rm -rf ${WORKDIR}
|
||||
fi
|
||||
|
||||
- uses: actions/github-script@v7
|
||||
- uses: actions/github-script@v6
|
||||
if: always()
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
|
||||
|
||||
@@ -80,13 +80,13 @@ runs:
|
||||
|
||||
- name: Checkout
|
||||
if: inputs.needs_postgres_source == 'true'
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
2
.github/workflows/approved-for-ci-run.yml
vendored
2
.github/workflows/approved-for-ci-run.yml
vendored
@@ -64,7 +64,7 @@ jobs:
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: main
|
||||
token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
10
.github/workflows/benchmarking.yml
vendored
10
.github/workflows/benchmarking.yml
vendored
@@ -66,7 +66,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -221,7 +221,7 @@ jobs:
|
||||
timeout-minutes: 480
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -366,7 +366,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -465,7 +465,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -562,7 +562,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
|
||||
51
.github/workflows/build_and_test.yml
vendored
51
.github/workflows/build_and_test.yml
vendored
@@ -69,7 +69,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -106,13 +106,13 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -146,7 +146,7 @@ jobs:
|
||||
# Disabled for now
|
||||
# - name: Restore cargo deps cache
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v4
|
||||
# uses: actions/cache@v3
|
||||
# with:
|
||||
# path: |
|
||||
# !~/.cargo/registry/src
|
||||
@@ -231,7 +231,7 @@ jobs:
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -303,7 +303,7 @@ jobs:
|
||||
# compressed crates.
|
||||
# - name: Cache cargo deps
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v4
|
||||
# uses: actions/cache@v3
|
||||
# with:
|
||||
# path: |
|
||||
# ~/.cargo/registry/
|
||||
@@ -317,21 +317,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -451,7 +451,7 @@ jobs:
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -472,14 +472,9 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
get-benchmarks-durations:
|
||||
@@ -493,10 +488,10 @@ jobs:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -530,7 +525,7 @@ jobs:
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -559,7 +554,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -570,7 +565,7 @@ jobs:
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@v7
|
||||
- uses: actions/github-script@v6
|
||||
if: ${{ !cancelled() }}
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
@@ -610,7 +605,7 @@ jobs:
|
||||
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
@@ -679,7 +674,7 @@ jobs:
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
|
||||
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/github-script@v7
|
||||
- uses: actions/github-script@v6
|
||||
env:
|
||||
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
@@ -905,7 +900,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -1119,7 +1114,7 @@ jobs:
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 0
|
||||
@@ -1142,7 +1137,7 @@ jobs:
|
||||
|
||||
- name: Create git tag
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v7
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
@@ -1156,7 +1151,7 @@ jobs:
|
||||
|
||||
- name: Create GitHub release
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v7
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
|
||||
16
.github/workflows/neon_extra_builds.yml
vendored
16
.github/workflows/neon_extra_builds.yml
vendored
@@ -57,21 +57,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
@@ -172,21 +172,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -356,7 +356,7 @@ jobs:
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Publish build stats report
|
||||
uses: actions/github-script@v7
|
||||
uses: actions/github-script@v6
|
||||
env:
|
||||
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
|
||||
6
.github/workflows/pg_clients.yml
vendored
6
.github/workflows/pg_clients.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
# It will be fixed after switching to gen2 runner
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
retention-days: 7
|
||||
name: python-test-pg_clients-${{ runner.os }}-stage-logs
|
||||
|
||||
5
.github/workflows/trigger-e2e-tests.yml
vendored
5
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -9,7 +9,7 @@ on:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
|
||||
env:
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -115,3 +115,4 @@ jobs:
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
28
Cargo.lock
generated
28
Cargo.lock
generated
@@ -286,7 +286,6 @@ dependencies = [
|
||||
"git-version",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
@@ -1157,6 +1156,16 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
|
||||
|
||||
[[package]]
|
||||
name = "close_fds"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.0"
|
||||
@@ -1814,7 +1823,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
|
||||
dependencies = [
|
||||
"enumset_derive",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2759,17 +2767,6 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "leaky-bucket"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
|
||||
dependencies = [
|
||||
"parking_lot 0.12.1",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
@@ -3461,7 +3458,6 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
@@ -3471,6 +3467,7 @@ dependencies = [
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"close_fds",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
"crc32c",
|
||||
@@ -3489,7 +3486,6 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -3552,7 +3548,6 @@ dependencies = [
|
||||
"enum-map",
|
||||
"hex",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
@@ -6363,7 +6358,6 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"leaky-bucket",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
|
||||
@@ -66,6 +66,7 @@ camino = "1.1.6"
|
||||
cfg-if = "1.0.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
close_fds = "0.3.2"
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
@@ -97,7 +98,6 @@ ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
|
||||
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
&& mold -run cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
|
||||
@@ -769,24 +769,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_ivm"
|
||||
# compile pg_ivm extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-ivm-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
|
||||
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -828,7 +810,6 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
|
||||
16
Makefile
16
Makefile
@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
|
||||
|
||||
.PHONY: neon-pg-clean-ext-%
|
||||
neon-pg-clean-ext-%:
|
||||
.PHONY: neon-pg-ext-clean-%
|
||||
neon-pg-ext-clean-%:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
@@ -216,11 +216,11 @@ neon-pg-ext: \
|
||||
neon-pg-ext-v15 \
|
||||
neon-pg-ext-v16
|
||||
|
||||
.PHONY: neon-pg-clean-ext
|
||||
neon-pg-clean-ext: \
|
||||
neon-pg-clean-ext-v14 \
|
||||
neon-pg-clean-ext-v15 \
|
||||
neon-pg-clean-ext-v16
|
||||
.PHONY: neon-pg-ext-clean
|
||||
neon-pg-ext-clean: \
|
||||
neon-pg-ext-clean-v14 \
|
||||
neon-pg-ext-clean-v15 \
|
||||
neon-pg-ext-clean-v16
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
@@ -249,7 +249,7 @@ postgres-check: \
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean: postgres-clean neon-pg-clean-ext
|
||||
clean: postgres-clean neon-pg-ext-clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
|
||||
10
README.md
10
README.md
@@ -249,16 +249,6 @@ testing locally, it is convenient to run just one set of permutations, like this
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
You may find yourself in need of flamegraphs for software in this repository.
|
||||
You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
|
||||
|
||||
>[!IMPORTANT]
|
||||
> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
|
||||
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
|
||||
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
|
||||
|
||||
## Documentation
|
||||
|
||||
[docs](/docs) Contains a top-level overview of all available markdown documentation.
|
||||
|
||||
@@ -45,6 +45,7 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
@@ -52,9 +53,7 @@ use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{
|
||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||
};
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version;
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
@@ -395,15 +394,6 @@ fn main() -> Result<()> {
|
||||
info!("synced safekeepers at lsn {lsn}");
|
||||
}
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::TerminationPending {
|
||||
state.status = ComputeStatus::Terminated;
|
||||
compute.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = true
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if let Err(err) = compute.check_for_core_dumps() {
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
@@ -533,7 +523,16 @@ fn cli() -> clap::Command {
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
forward_termination_signal();
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
kill(pg_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -28,8 +28,6 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
@@ -326,8 +324,7 @@ impl ComputeNode {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
@@ -1324,17 +1321,3 @@ LIMIT 100",
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
|
||||
kill(pg_pid, Signal::SIGQUIT).ok();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,9 +51,6 @@ pub fn write_postgres_conf(
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
writeln!(
|
||||
file,
|
||||
|
||||
@@ -5,7 +5,6 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
@@ -124,17 +123,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/terminate") => {
|
||||
info!("serving /terminate POST request");
|
||||
match handle_terminate_request(compute).await {
|
||||
Ok(()) => Response::new(Body::empty()),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /terminate request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -309,49 +297,6 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return Ok(());
|
||||
}
|
||||
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for termination request: {:?}",
|
||||
state.status.clone()
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.status = ComputeStatus::TerminationPending;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
}
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Terminated.
|
||||
// This is needed to do not block the main pool of workers and
|
||||
// be able to serve other requests while some particular request
|
||||
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become Terminated, current status: {:?}",
|
||||
state.status
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
info!("terminated Postgres");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
|
||||
@@ -168,29 +168,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/terminate:
|
||||
post:
|
||||
tags:
|
||||
- Terminate
|
||||
summary: Terminate Postgres and wait for it to exit
|
||||
description: ""
|
||||
operationId: terminate
|
||||
responses:
|
||||
200:
|
||||
description: Result
|
||||
412:
|
||||
description: "wrong state"
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: "Unexpected error"
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
|
||||
@@ -4,11 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs and behaviors
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
aws-config.workspace = true
|
||||
@@ -18,7 +13,6 @@ clap.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hyper.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
|
||||
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
|
||||
self.shards
|
||||
.sort_by_key(|(shard, _node_id)| shard.shard_number);
|
||||
|
||||
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
|
||||
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
|
||||
// We have pageservers for all the shards: emit a configuration update
|
||||
return Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
self.shards.len(),
|
||||
shard_count.count()
|
||||
shard_count.0
|
||||
);
|
||||
}
|
||||
|
||||
@@ -155,7 +155,7 @@ impl ComputeHook {
|
||||
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint.reconfigure(compute_pageservers.clone()).await?;
|
||||
}
|
||||
}
|
||||
@@ -177,7 +177,7 @@ impl ComputeHook {
|
||||
req
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
tracing::debug!(
|
||||
"Sending notify request to {} ({:?})",
|
||||
url,
|
||||
reconfigure_request
|
||||
@@ -266,7 +266,7 @@ impl ComputeHook {
|
||||
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
|
||||
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
|
||||
/// the proper pageserver nodes for a tenant.
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
|
||||
#[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
|
||||
pub(super) async fn notify(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -298,7 +298,7 @@ impl ComputeHook {
|
||||
let Some(reconfigure_request) = reconfigure_request else {
|
||||
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||
// until it does.
|
||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||
tracing::debug!("Tenant isn't yet ready to emit a notification",);
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
|
||||
@@ -114,10 +114,7 @@ async fn handle_tenant_create(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
service.tenant_create(create_req).await?,
|
||||
)
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
@@ -199,7 +196,7 @@ async fn handle_tenant_timeline_create(
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?,
|
||||
@@ -336,22 +333,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
state.service.tenants_dump()
|
||||
}
|
||||
|
||||
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
state.service.scheduler_dump()
|
||||
}
|
||||
|
||||
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -440,13 +421,6 @@ pub fn make_router(
|
||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||
request_span(r, handle_node_drop)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
request_span(r, handle_scheduler_dump)
|
||||
})
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
|
||||
@@ -3,7 +3,6 @@ use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod compute_hook;
|
||||
pub mod http;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
@@ -12,7 +11,7 @@ mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
enum PlacementPolicy {
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
@@ -23,7 +22,7 @@ enum PlacementPolicy {
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
struct Sequence(u64);
|
||||
|
||||
impl Sequence {
|
||||
@@ -38,12 +37,6 @@ impl std::fmt::Display for Sequence {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Sequence {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicCounter<Sequence> for Sequence {
|
||||
fn cnt_advance(&mut self, v: Sequence) {
|
||||
assert!(*self <= v);
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
///
|
||||
use anyhow::{anyhow, Context};
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service};
|
||||
use aws_config::{self, BehaviorVersion, Region};
|
||||
@@ -16,7 +15,6 @@ use diesel::Connection;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
@@ -206,8 +204,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
preinitialize_metrics();
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
|
||||
@@ -241,23 +237,15 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let auth = secrets
|
||||
.public_key
|
||||
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
|
||||
let router = make_router(service.clone(), auth)
|
||||
let router = make_router(service, auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
|
||||
|
||||
// Start HTTP server
|
||||
let server_shutdown = CancellationToken::new();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(router_service)
|
||||
.with_graceful_shutdown({
|
||||
let server_shutdown = server_shutdown.clone();
|
||||
async move {
|
||||
server_shutdown.cancelled().await;
|
||||
}
|
||||
});
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
let server_task = tokio::task::spawn(server);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
@@ -278,16 +266,5 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// Stop HTTP server first, so that we don't have to service requests
|
||||
// while shutting down Service
|
||||
server_shutdown.cancel();
|
||||
if let Err(e) = server_task.await {
|
||||
tracing::error!("Error joining HTTP server task: {e}")
|
||||
}
|
||||
tracing::info!("Joined HTTP server task");
|
||||
|
||||
service.shutdown().await;
|
||||
tracing::info!("Service shutdown complete");
|
||||
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) struct ReconcilerMetrics {
|
||||
pub(crate) spawned: IntCounter,
|
||||
pub(crate) complete: IntCounterVec,
|
||||
}
|
||||
|
||||
impl ReconcilerMetrics {
|
||||
// Labels used on [`Self::complete`]
|
||||
pub(crate) const SUCCESS: &'static str = "ok";
|
||||
pub(crate) const ERROR: &'static str = "success";
|
||||
pub(crate) const CANCEL: &'static str = "cancel";
|
||||
}
|
||||
|
||||
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
||||
spawned: register_int_counter!(
|
||||
"storage_controller_reconcile_spawn",
|
||||
"Count of how many times we spawn a reconcile task",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
complete: register_int_counter_vec!(
|
||||
"storage_controller_reconcile_complete",
|
||||
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
||||
&["status"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
Lazy::force(&RECONCILER);
|
||||
}
|
||||
@@ -1,16 +1,9 @@
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use serde::Serialize;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::persistence::NodePersistence;
|
||||
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
/// Scheduling statistics are maintened separately in [`crate::scheduler`].
|
||||
///
|
||||
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
|
||||
/// implementation of serialization on this type is only for debug dumps.
|
||||
#[derive(Clone, Serialize, Eq, PartialEq)]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Node {
|
||||
pub(crate) id: NodeId,
|
||||
|
||||
|
||||
@@ -222,7 +222,7 @@ impl Persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
|
||||
tenants_map.insert(tenant_shard_id, tsp);
|
||||
@@ -318,7 +318,7 @@ impl Persistence {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
}
|
||||
@@ -340,7 +340,7 @@ impl Persistence {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation.eq(generation + 1),
|
||||
generation_pageserver.eq(node_id.0 as i64),
|
||||
@@ -362,7 +362,7 @@ impl Persistence {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
@@ -392,19 +392,21 @@ impl Persistence {
|
||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||
// Mark parent shards as splitting
|
||||
|
||||
let expect_parent_records = std::cmp::max(1, old_shard_count.0);
|
||||
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||
.set((splitting.eq(1),))
|
||||
.execute(conn)?;
|
||||
if u8::try_from(updated)
|
||||
.map_err(|_| DatabaseError::Logical(
|
||||
format!("Overflow existing shard count {} while splitting", updated))
|
||||
)? != old_shard_count.count() {
|
||||
)? != expect_parent_records {
|
||||
// Perhaps a deletion or another split raced with this attempt to split, mutating
|
||||
// the parent shards that we intend to split. In this case the split request should fail.
|
||||
return Err(DatabaseError::Logical(
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
|
||||
));
|
||||
}
|
||||
|
||||
@@ -416,7 +418,7 @@ impl Persistence {
|
||||
let mut parent = crate::schema::tenant_shards::table
|
||||
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
|
||||
.load::<TenantShardPersistence>(conn)?;
|
||||
let parent = if parent.len() != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
@@ -457,7 +459,7 @@ impl Persistence {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
// Clear sharding flag
|
||||
@@ -477,7 +479,7 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
|
||||
@@ -13,7 +13,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
@@ -27,7 +26,7 @@ pub(super) struct Reconciler {
|
||||
pub(super) tenant_shard_id: TenantShardId,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) intent: TargetState,
|
||||
pub(crate) intent: IntentState,
|
||||
pub(crate) config: TenantConfig,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
@@ -54,46 +53,14 @@ pub(super) struct Reconciler {
|
||||
/// the tenant is changed.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
|
||||
/// can wait for all the reconcilers to respond to their cancellation tokens.
|
||||
pub(crate) _gate_guard: GateGuard,
|
||||
|
||||
/// Access to persistent storage for updating generation numbers
|
||||
pub(crate) persistence: Arc<Persistence>,
|
||||
}
|
||||
|
||||
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
|
||||
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
|
||||
/// and the TargetState is just the instruction for a particular Reconciler run.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TargetState {
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
impl TargetState {
|
||||
pub(crate) fn from_intent(intent: &IntentState) -> Self {
|
||||
Self {
|
||||
attached: *intent.get_attached(),
|
||||
secondary: intent.get_secondary().clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = self.secondary.clone();
|
||||
if let Some(node_id) = &self.attached {
|
||||
result.push(*node_id);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum ReconcileError {
|
||||
#[error(transparent)]
|
||||
Notify(#[from] NotifyError),
|
||||
#[error("Cancelled")]
|
||||
Cancel,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -296,7 +263,7 @@ impl Reconciler {
|
||||
secondary_conf,
|
||||
tenant_conf: config.clone(),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_count: shard.count.0,
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
}
|
||||
}
|
||||
@@ -491,7 +458,7 @@ impl Reconciler {
|
||||
generation: None,
|
||||
secondary_conf: None,
|
||||
shard_number: self.shard.number.0,
|
||||
shard_count: self.shard.count.literal(),
|
||||
shard_count: self.shard.count.0,
|
||||
shard_stripe_size: self.shard.stripe_size.0,
|
||||
tenant_conf: self.config.clone(),
|
||||
},
|
||||
@@ -499,9 +466,6 @@ impl Reconciler {
|
||||
}
|
||||
|
||||
for (node_id, conf) in changes {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(ReconcileError::Cancel);
|
||||
}
|
||||
self.location_config(node_id, conf, None).await?;
|
||||
}
|
||||
|
||||
@@ -542,7 +506,7 @@ pub(crate) fn attached_location_conf(
|
||||
generation: generation.into(),
|
||||
secondary_conf: None,
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_count: shard.count.0,
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
@@ -557,7 +521,7 @@ pub(crate) fn secondary_location_conf(
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_count: shard.count.0,
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
|
||||
/// Scenarios in which we cannot find a suitable location for a tenant shard
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ScheduleError {
|
||||
@@ -18,179 +19,52 @@ impl From<ScheduleError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Eq, PartialEq)]
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||
shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
may_schedule: bool,
|
||||
}
|
||||
|
||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||
/// on which to run.
|
||||
///
|
||||
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
|
||||
/// impl is only for debug dumps.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct Scheduler {
|
||||
nodes: HashMap<NodeId, SchedulerNode>,
|
||||
tenant_counts: HashMap<NodeId, usize>,
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
for node in nodes {
|
||||
scheduler_nodes.insert(
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
pub(crate) fn new(
|
||||
tenants: &BTreeMap<TenantShardId, TenantState>,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
) -> Self {
|
||||
let mut tenant_counts = HashMap::new();
|
||||
for node_id in nodes.keys() {
|
||||
tenant_counts.insert(*node_id, 0);
|
||||
}
|
||||
|
||||
Self {
|
||||
nodes: scheduler_nodes,
|
||||
}
|
||||
}
|
||||
|
||||
/// For debug/support: check that our internal statistics are in sync with the state of
|
||||
/// the nodes & tenant shards.
|
||||
///
|
||||
/// If anything is inconsistent, log details and return an error.
|
||||
pub(crate) fn consistency_check<'a>(
|
||||
&self,
|
||||
nodes: impl Iterator<Item = &'a Node>,
|
||||
shards: impl Iterator<Item = &'a TenantState>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
||||
for node in nodes {
|
||||
expect_nodes.insert(
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
for shard in shards {
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
node_id
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in shard.intent.get_secondary() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
node_id
|
||||
),
|
||||
}
|
||||
for tenant in tenants.values() {
|
||||
if let Some(ps) = tenant.intent.attached {
|
||||
let entry = tenant_counts.entry(ps).or_insert(0);
|
||||
*entry += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (node_id, expect_node) in &expect_nodes {
|
||||
let Some(self_node) = self.nodes.get(node_id) else {
|
||||
anyhow::bail!("Node {node_id} not found in Self")
|
||||
};
|
||||
|
||||
if self_node != expect_node {
|
||||
tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
|
||||
tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
|
||||
tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
|
||||
|
||||
anyhow::bail!("Inconsistent state on {node_id}");
|
||||
for (node_id, node) in nodes {
|
||||
if !node.may_schedule() {
|
||||
tenant_counts.remove(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
if expect_nodes.len() != self.nodes.len() {
|
||||
// We just checked that all the expected nodes are present. If the lengths don't match,
|
||||
// it means that we have nodes in Self that are unexpected.
|
||||
for node_id in self.nodes.keys() {
|
||||
if !expect_nodes.contains_key(node_id) {
|
||||
anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Increment the reference count of a node. This reference count is used to guide scheduling
|
||||
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
|
||||
/// this node.
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
debug_assert!(false);
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count += 1;
|
||||
}
|
||||
|
||||
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
|
||||
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
|
||||
pub(crate) fn node_upsert(&mut self, node: &Node) {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
match self.nodes.entry(node.id) {
|
||||
Occupied(mut entry) => {
|
||||
entry.get_mut().may_schedule = node.may_schedule();
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn node_remove(&mut self, node_id: NodeId) {
|
||||
if self.nodes.remove(&node_id).is_none() {
|
||||
tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
|
||||
}
|
||||
Self { tenant_counts }
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
if self.tenant_counts.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.nodes
|
||||
.tenant_counts
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || !v.may_schedule {
|
||||
if hard_exclude.contains(k) {
|
||||
None
|
||||
} else {
|
||||
Some((*k, v.shard_count))
|
||||
Some((*k, *v))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
@@ -199,108 +73,17 @@ impl Scheduler {
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
|
||||
// After applying constraints, no pageservers were left
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
for (node_id, count) in &tenant_counts {
|
||||
tracing::info!("tenant_counts[{node_id}]={count}");
|
||||
}
|
||||
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
// is IntentState's job when the scheduled location is used.
|
||||
|
||||
tracing::info!("scheduler selected node {node_id}");
|
||||
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
|
||||
Ok(node_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{node::Node, tenant_state::IntentState};
|
||||
|
||||
#[test]
|
||||
fn scheduler_basic() -> anyhow::Result<()> {
|
||||
let mut nodes = HashMap::new();
|
||||
nodes.insert(
|
||||
NodeId(1),
|
||||
Node {
|
||||
id: NodeId(1),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: String::new(),
|
||||
listen_http_port: 0,
|
||||
listen_pg_addr: String::new(),
|
||||
listen_pg_port: 0,
|
||||
},
|
||||
);
|
||||
|
||||
nodes.insert(
|
||||
NodeId(2),
|
||||
Node {
|
||||
id: NodeId(2),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: String::new(),
|
||||
listen_http_port: 0,
|
||||
listen_pg_addr: String::new(),
|
||||
listen_pg_port: 0,
|
||||
},
|
||||
);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
|
||||
|
||||
t1_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Dropping an IntentState without clearing it causes a panic in debug mode,
|
||||
// because we have failed to properly update scheduler shard counts.
|
||||
let result = std::panic::catch_unwind(move || {
|
||||
drop(t2_intent);
|
||||
});
|
||||
assert!(result.is_err());
|
||||
} else {
|
||||
t2_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,47 +1,27 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
};
|
||||
use serde::Serialize;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{instrument, Instrument};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::NodeId,
|
||||
seqwait::{SeqWait, SeqWaitError},
|
||||
sync::gate::Gate,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::{split_state::SplitState, Persistence},
|
||||
reconciler::{
|
||||
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
|
||||
},
|
||||
persistence::Persistence,
|
||||
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
|
||||
scheduler::{ScheduleError, Scheduler},
|
||||
service, PlacementPolicy, Sequence,
|
||||
};
|
||||
|
||||
/// Serialization helper
|
||||
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
T: Clone + std::fmt::Display,
|
||||
{
|
||||
serializer.collect_str(&v.lock().unwrap())
|
||||
}
|
||||
|
||||
/// In-memory state for a particular tenant shard.
|
||||
///
|
||||
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
|
||||
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct TenantState {
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -76,29 +56,20 @@ pub(crate) struct TenantState {
|
||||
/// If a reconcile task is currently in flight, it may be joined here (it is
|
||||
/// only safe to join if either the result has been received or the reconciler's
|
||||
/// cancellation token has been fired)
|
||||
#[serde(skip)]
|
||||
pub(crate) reconciler: Option<ReconcilerHandle>,
|
||||
|
||||
/// If a tenant is being split, then all shards with that TenantId will have a
|
||||
/// SplitState set, this acts as a guard against other operations such as background
|
||||
/// reconciliation, and timeline creation.
|
||||
pub(crate) splitting: SplitState,
|
||||
|
||||
/// Optionally wait for reconciliation to complete up to a particular
|
||||
/// sequence number.
|
||||
#[serde(skip)]
|
||||
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// Indicates sequence number for which we have encountered an error reconciling. If
|
||||
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
|
||||
/// and callers should stop waiting for `waiter` and propagate the error.
|
||||
#[serde(skip)]
|
||||
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// The most recent error from a reconcile on this tenant
|
||||
/// TODO: generalize to an array of recent events
|
||||
/// TOOD: use a ArcSwap instead of mutex for faster reads?
|
||||
#[serde(serialize_with = "read_mutex_content")]
|
||||
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
|
||||
|
||||
/// If we have a pending compute notification that for some reason we weren't able to send,
|
||||
@@ -108,112 +79,13 @@ pub(crate) struct TenantState {
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
#[derive(Default, Clone, Debug)]
|
||||
pub(crate) struct IntentState {
|
||||
attached: Option<NodeId>,
|
||||
secondary: Vec<NodeId>,
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
|
||||
if let Some(node_id) = node_id {
|
||||
scheduler.node_inc_ref(node_id);
|
||||
}
|
||||
Self {
|
||||
attached: node_id,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
|
||||
if self.attached != new_attached {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
if let Some(new_attached) = &new_attached {
|
||||
scheduler.node_inc_ref(*new_attached);
|
||||
}
|
||||
self.attached = new_attached;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
|
||||
debug_assert!(!self.secondary.contains(&new_secondary));
|
||||
scheduler.node_inc_ref(new_secondary);
|
||||
self.secondary.push(new_secondary);
|
||||
}
|
||||
|
||||
/// It is legal to call this with a node that is not currently a secondary: that is a no-op
|
||||
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
|
||||
let index = self.secondary.iter().position(|n| *n == node_id);
|
||||
if let Some(index) = index {
|
||||
scheduler.node_dec_ref(node_id);
|
||||
self.secondary.remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
for secondary in self.secondary.drain(..) {
|
||||
scheduler.node_dec_ref(secondary);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
|
||||
self.clear_secondary(scheduler);
|
||||
}
|
||||
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn get_attached(&self) -> &Option<NodeId> {
|
||||
&self.attached
|
||||
}
|
||||
|
||||
pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
|
||||
&self.secondary
|
||||
}
|
||||
|
||||
/// When a node goes offline, we update intents to avoid using it
|
||||
/// as their attached pageserver.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IntentState {
|
||||
fn drop(&mut self) {
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
#[derive(Default, Clone)]
|
||||
pub(crate) struct ObservedState {
|
||||
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
|
||||
}
|
||||
@@ -227,7 +99,7 @@ pub(crate) struct ObservedState {
|
||||
/// what it is (e.g. we failed partway through configuring it)
|
||||
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
|
||||
/// and that configuration will still be present unless something external interfered.
|
||||
#[derive(Clone, Serialize)]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct ObservedStateLocation {
|
||||
/// If None, it means we do not know the status of this shard's location on this node, but
|
||||
/// we know that we might have some state on this node.
|
||||
@@ -303,6 +175,46 @@ pub(crate) struct ReconcileResult {
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn single(node_id: Option<NodeId>) -> Self {
|
||||
Self {
|
||||
attached: node_id,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// When a node goes offline, we update intents to avoid using it
|
||||
/// as their attached pageserver.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservedState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
@@ -326,7 +238,6 @@ impl TenantState {
|
||||
observed: ObservedState::default(),
|
||||
config: TenantConfig::default(),
|
||||
reconciler: None,
|
||||
splitting: SplitState::Idle,
|
||||
sequence: Sequence(1),
|
||||
waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
@@ -396,12 +307,12 @@ impl TenantState {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
@@ -409,14 +320,14 @@ impl TenantState {
|
||||
// Should have exactly one attached, and N secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
self.intent.secondary.push(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
@@ -424,12 +335,12 @@ impl TenantState {
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.set_attached(scheduler, None);
|
||||
self.intent.attached = None;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
@@ -504,8 +415,6 @@ impl TenantState {
|
||||
false
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
@@ -513,8 +422,6 @@ impl TenantState {
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
persistence: &Arc<Persistence>,
|
||||
gate: &Gate,
|
||||
cancel: &CancellationToken,
|
||||
) -> Option<ReconcilerWaiter> {
|
||||
// If there are any ambiguous observed states, and the nodes they refer to are available,
|
||||
// we should reconcile to clean them up.
|
||||
@@ -536,14 +443,6 @@ impl TenantState {
|
||||
return None;
|
||||
}
|
||||
|
||||
// If we are currently splitting, then never start a reconciler task: the splitting logic
|
||||
// requires that shards are not interfered with while it runs. Do this check here rather than
|
||||
// up top, so that we only log this message if we would otherwise have done a reconciliation.
|
||||
if !matches!(self.splitting, SplitState::Idle) {
|
||||
tracing::info!("Refusing to reconcile, splitting in progress");
|
||||
return None;
|
||||
}
|
||||
|
||||
// Reconcile already in flight for the current sequence?
|
||||
if let Some(handle) = &self.reconciler {
|
||||
if handle.sequence == self.sequence {
|
||||
@@ -561,101 +460,70 @@ impl TenantState {
|
||||
// doing our sequence's work.
|
||||
let old_handle = self.reconciler.take();
|
||||
|
||||
let Ok(gate_guard) = gate.enter() else {
|
||||
// Shutting down, don't start a reconciler
|
||||
return None;
|
||||
};
|
||||
|
||||
let reconciler_cancel = cancel.child_token();
|
||||
let cancel = CancellationToken::new();
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
shard: self.shard,
|
||||
generation: self.generation,
|
||||
intent: TargetState::from_intent(&self.intent),
|
||||
intent: self.intent.clone(),
|
||||
config: self.config.clone(),
|
||||
observed: self.observed.clone(),
|
||||
pageservers: pageservers.clone(),
|
||||
compute_hook: compute_hook.clone(),
|
||||
service_config: service_config.clone(),
|
||||
_gate_guard: gate_guard,
|
||||
cancel: reconciler_cancel.clone(),
|
||||
cancel: cancel.clone(),
|
||||
persistence: persistence.clone(),
|
||||
compute_notify_failure: false,
|
||||
};
|
||||
|
||||
let reconcile_seq = self.sequence;
|
||||
|
||||
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
|
||||
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
|
||||
let must_notify = self.pending_compute_notification;
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
}
|
||||
let join_handle = tokio::task::spawn(async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
}
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
// Attempt to make observed state match intent state
|
||||
let result = reconciler.reconcile().await;
|
||||
|
||||
// If we know we had a pending compute notification from some previous action, send a notification irrespective
|
||||
// of whether the above reconcile() did any work
|
||||
if result.is_ok() && must_notify {
|
||||
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
// Update result counter
|
||||
match &result {
|
||||
Ok(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
||||
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
||||
Err(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
||||
}
|
||||
.inc();
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
pending_compute_notification: reconciler.compute_notify_failure,
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
.instrument(reconciler_span),
|
||||
);
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Attempt to make observed state match intent state
|
||||
let result = reconciler.reconcile().await;
|
||||
|
||||
// If we know we had a pending compute notification from some previous action, send a notification irrespective
|
||||
// of whether the above reconcile() did any work
|
||||
if result.is_ok() && must_notify {
|
||||
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
pending_compute_notification: reconciler.compute_notify_failure,
|
||||
})
|
||||
.ok();
|
||||
});
|
||||
|
||||
self.reconciler = Some(ReconcilerHandle {
|
||||
sequence: self.sequence,
|
||||
handle: join_handle,
|
||||
cancel: reconciler_cancel,
|
||||
cancel,
|
||||
});
|
||||
|
||||
Some(ReconcilerWaiter {
|
||||
@@ -680,18 +548,4 @@ impl TenantState {
|
||||
|
||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||
TenantShardPersistence {
|
||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: self.tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: self.shard.stripe_size.0 as i32,
|
||||
generation: self.generation.into().unwrap_or(0) as i32,
|
||||
generation_pageserver: i64::MAX,
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,7 +113,7 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
@@ -137,7 +137,7 @@ impl FromStr for NodeAvailability {
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
|
||||
@@ -450,7 +450,7 @@ async fn handle_tenant(
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters {
|
||||
count: ShardCount::new(shard_count),
|
||||
count: ShardCount(shard_count),
|
||||
stripe_size: shard_stripe_size
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
@@ -652,10 +652,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
let name = import_match
|
||||
.get_one::<String>("node-name")
|
||||
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||
let update_catalog = import_match
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
// Parse base inputs
|
||||
let base_tarfile = import_match
|
||||
@@ -698,7 +694,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
!update_catalog,
|
||||
)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -836,10 +831,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.get_one::<String>("endpoint_id")
|
||||
.map(String::to_string)
|
||||
.unwrap_or_else(|| format!("ep-{branch_name}"));
|
||||
let update_catalog = sub_args
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
let lsn = sub_args
|
||||
.get_one::<String>("lsn")
|
||||
@@ -889,7 +880,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
!update_catalog,
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
@@ -928,11 +918,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.get(endpoint_id.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
|
||||
let create_test_user = sub_args
|
||||
.get_one::<bool>("create-test-user")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
@@ -987,7 +972,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
pageservers,
|
||||
remote_ext_config,
|
||||
stripe_size.0 as usize,
|
||||
create_test_user,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1473,18 +1457,6 @@ fn cli() -> Command {
|
||||
.required(false)
|
||||
.default_value("1");
|
||||
|
||||
let update_catalog = Arg::new("update-catalog")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("update-catalog")
|
||||
.help("If set, will set up the catalog for neon_superuser")
|
||||
.required(false);
|
||||
|
||||
let create_test_user = Arg::new("create-test-user")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("create-test-user")
|
||||
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1545,7 +1517,6 @@ fn cli() -> Command {
|
||||
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(update_catalog.clone())
|
||||
)
|
||||
).subcommand(
|
||||
Command::new("tenant")
|
||||
@@ -1659,7 +1630,6 @@ fn cli() -> Command {
|
||||
.required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(hot_standby_arg.clone())
|
||||
.arg(update_catalog)
|
||||
)
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
@@ -1667,7 +1637,6 @@ fn cli() -> Command {
|
||||
.arg(endpoint_pageserver_id_arg.clone())
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
|
||||
@@ -41,15 +41,11 @@ use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::Database;
|
||||
use compute_api::spec::PgIdent;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use compute_api::spec::Role;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -126,7 +122,6 @@ impl ComputeControlPlane {
|
||||
http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
skip_pg_catalog_updates: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
@@ -145,7 +140,7 @@ impl ComputeControlPlane {
|
||||
// before and after start are the same. So, skip catalog updates,
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates,
|
||||
skip_pg_catalog_updates: true,
|
||||
features: vec![],
|
||||
});
|
||||
|
||||
@@ -160,7 +155,7 @@ impl ComputeControlPlane {
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates,
|
||||
skip_pg_catalog_updates: true,
|
||||
features: vec![],
|
||||
})?,
|
||||
)?;
|
||||
@@ -505,7 +500,6 @@ impl Endpoint {
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -557,26 +551,8 @@ impl Endpoint {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
roles: vec![],
|
||||
databases: vec![],
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf),
|
||||
},
|
||||
@@ -601,16 +577,11 @@ impl Endpoint {
|
||||
.open(self.endpoint_path().join("compute.log"))?;
|
||||
|
||||
// Launch compute_ctl
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{}'", conn_str);
|
||||
if create_test_user {
|
||||
let conn_str = self.connstr("user", "neondb");
|
||||
println!("Also at '{}'", conn_str);
|
||||
}
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
cmd.args(["--http-port", &self.http_address.port().to_string()])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &conn_str])
|
||||
.args(["--connstr", &self.connstr()])
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
@@ -681,9 +652,7 @@ impl Endpoint {
|
||||
}
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPending
|
||||
| ComputeStatus::Terminated => {
|
||||
| ComputeStatus::Configuration => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
}
|
||||
@@ -814,13 +783,13 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self, user: &str, db_name: &str) -> String {
|
||||
pub fn connstr(&self) -> String {
|
||||
format!(
|
||||
"postgresql://{}@{}:{}/{}",
|
||||
user,
|
||||
"cloud_admin",
|
||||
self.pg_address.ip(),
|
||||
self.pg_address.port(),
|
||||
db_name
|
||||
"postgres"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -400,11 +400,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -510,11 +505,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -52,10 +52,6 @@ pub enum ComputeStatus {
|
||||
// compute will exit soon or is waiting for
|
||||
// control-plane to terminate it.
|
||||
Failed,
|
||||
// Termination requested
|
||||
TerminationPending,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
|
||||
|
||||
@@ -201,11 +201,6 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
|
||||
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
|
||||
res[0] = self.inc.remove_label_values(vals);
|
||||
res[1] = self.dec.remove_label_values(vals);
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> GenericCounterPair<P> {
|
||||
@@ -252,15 +247,6 @@ impl<P: Atomic> GenericCounterPair<P> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> Clone for GenericCounterPair<P> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
inc: self.inc.clone(),
|
||||
dec: self.dec.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard returned by [`GenericCounterPair::guard`]
|
||||
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ hex.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ use postgres_ffi::BLCKSZ;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::key::Key;
|
||||
use itertools::Itertools;
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
@@ -64,36 +63,9 @@ impl KeySpace {
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
/// Merge another keyspace into the current one.
|
||||
/// Note: the keyspaces must not ovelap (enforced via assertions)
|
||||
pub fn merge(&mut self, other: &KeySpace) {
|
||||
let all_ranges = self
|
||||
.ranges
|
||||
.iter()
|
||||
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
|
||||
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
let mut prev: Option<&Range<Key>> = None;
|
||||
for range in all_ranges {
|
||||
if let Some(prev) = prev {
|
||||
let overlap =
|
||||
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
|
||||
assert!(
|
||||
!overlap,
|
||||
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
|
||||
prev, range
|
||||
);
|
||||
}
|
||||
|
||||
accum.add_range(range.clone());
|
||||
prev = Some(range);
|
||||
}
|
||||
|
||||
self.ranges = accum.to_keyspace().ranges;
|
||||
}
|
||||
|
||||
/// Remove all keys in `other` from `self`.
|
||||
/// This can involve splitting or removing of existing ranges.
|
||||
/// Update the keyspace such that it doesn't contain any range
|
||||
/// that is overlapping with `other`. This can involve splitting or
|
||||
/// removing of existing ranges.
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||
(Some(start), Some(end)) => (start, end),
|
||||
@@ -248,7 +220,16 @@ impl KeySpaceAccum {
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
std::mem::take(self).to_keyspace()
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
let mut prev_accum = KeySpaceAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
KeySpace {
|
||||
ranges: prev_accum.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
@@ -298,13 +279,6 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
KeySpace { ranges }
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
let mut prev_accum = KeySpaceRandomAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
prev_accum.to_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
|
||||
@@ -180,7 +180,7 @@ pub enum TimelineState {
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
pub new_timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
@@ -214,14 +214,14 @@ impl ShardParameters {
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.count.is_unsharded()
|
||||
self.count == ShardCount(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ShardParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: ShardCount::new(0),
|
||||
count: ShardCount(0),
|
||||
stripe_size: Self::DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
@@ -283,7 +283,6 @@ pub struct TenantConfig {
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -310,35 +309,6 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
||||
pub threshold: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
pub struct ThrottleConfig {
|
||||
pub task_kinds: Vec<String>, // TaskKind
|
||||
pub initial: usize,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub refill_interval: Duration,
|
||||
pub refill_amount: NonZeroUsize,
|
||||
pub max: usize,
|
||||
pub fair: bool,
|
||||
}
|
||||
|
||||
impl ThrottleConfig {
|
||||
pub fn disabled() -> Self {
|
||||
Self {
|
||||
task_kinds: vec![], // effectively disables the throttle
|
||||
// other values don't matter with emtpy `task_kinds`.
|
||||
initial: 0,
|
||||
refill_interval: Duration::from_millis(1),
|
||||
refill_amount: NonZeroUsize::new(1).unwrap(),
|
||||
max: 1,
|
||||
fair: true,
|
||||
}
|
||||
}
|
||||
/// The requests per second allowed by the given config.
|
||||
pub fn steady_rps(&self) -> f64 {
|
||||
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
|
||||
}
|
||||
}
|
||||
|
||||
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
|
||||
/// lists out all possible states (and the virtual "Detached" state)
|
||||
/// in a flat form rather than using rust-style enums.
|
||||
@@ -720,7 +690,6 @@ pub enum PagestreamFeMessage {
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
GetVectoredPages(PagestreamGetVectoredPagesRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -732,7 +701,6 @@ pub enum PagestreamBeMessage {
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
GetVectoredPages(PagestreamGetVectoredPagesResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
@@ -744,7 +712,6 @@ enum PagestreamBeMessageTag {
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
GetVectoredPages = 106,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
@@ -756,7 +723,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
106 => Ok(PagestreamBeMessageTag::GetVectoredPages),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
@@ -799,15 +765,6 @@ pub struct PagestreamGetSlruSegmentRequest {
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamGetVectoredPagesRequest {
|
||||
pub latest: bool,
|
||||
pub lsn: Lsn,
|
||||
pub rel: RelTag,
|
||||
pub blkno: u32,
|
||||
pub count: u8,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub exists: bool,
|
||||
@@ -828,12 +785,6 @@ pub struct PagestreamGetSlruSegmentResponse {
|
||||
pub segment: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetVectoredPagesResponse {
|
||||
pub page_count: u8,
|
||||
pub pages: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub message: String,
|
||||
@@ -905,18 +856,6 @@ impl PagestreamFeMessage {
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
|
||||
Self::GetVectoredPages(req) => {
|
||||
bytes.put_u8(5);
|
||||
bytes.put_u8(u8::from(req.latest));
|
||||
bytes.put_u64(req.lsn.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
bytes.put_u8(req.rel.forknum);
|
||||
bytes.put_u32(req.blkno);
|
||||
bytes.put_u8(req.count);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -975,20 +914,6 @@ impl PagestreamFeMessage {
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
5 => Ok(PagestreamFeMessage::GetVectoredPages(
|
||||
PagestreamGetVectoredPagesRequest {
|
||||
latest: body.read_u8()? != 0,
|
||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
count: body.read_u8()?,
|
||||
},
|
||||
)),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
@@ -1030,12 +955,6 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
Self::GetVectoredPages(resp) => {
|
||||
bytes.put_u8(Tag::GetVectoredPages as u8);
|
||||
bytes.put_u8(resp.page_count);
|
||||
bytes.put(&resp.pages[..]);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -1084,15 +1003,6 @@ impl PagestreamBeMessage {
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
Tag::GetVectoredPages => {
|
||||
let page_count = buf.read_u8()?;
|
||||
let mut pages = vec![0; page_count as usize * 8192];
|
||||
buf.read_exact(&mut pages)?;
|
||||
Self::GetVectoredPages(PagestreamGetVectoredPagesResponse {
|
||||
page_count,
|
||||
pages: pages.into(),
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
@@ -1112,7 +1022,6 @@ impl PagestreamBeMessage {
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
Self::GetVectoredPages(_) => "GetVectoredPages",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,41 +13,10 @@ use utils::id::TenantId;
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(u8);
|
||||
pub struct ShardCount(pub u8);
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as `TenantShardId::unsharded`.
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
pub fn count(&self) -> u8 {
|
||||
if self.0 > 0 {
|
||||
self.0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// The literal internal value: this is **not** the number of shards in the
|
||||
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
|
||||
/// [`Self::count`] if you want to know the cardinality of shards.
|
||||
pub fn literal(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
@@ -117,7 +86,7 @@ impl TenantShardId {
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
|
||||
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||
@@ -502,12 +471,10 @@ impl ShardIdentity {
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A1: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
// A2: because key_is_shard0 also covers relation size keys, which are written
|
||||
// on all shards even though they're only maintained accurately on shard 0.
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
|
||||
@@ -25,7 +25,6 @@ hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -29,9 +29,6 @@ pub enum Scope {
|
||||
// Should only be used e.g. for status check.
|
||||
// Currently also used for connection from any pageserver to any safekeeper.
|
||||
SafekeeperData,
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
|
||||
@@ -12,7 +12,6 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -22,6 +21,7 @@ camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
close_fds.workspace = true
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
@@ -36,7 +36,6 @@ humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
md5.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
@@ -84,7 +83,7 @@ workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
rpds.workspace = true
|
||||
enum-map.workspace = true
|
||||
enumset = { workspace = true, features = ["serde"]}
|
||||
enumset.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
|
||||
@@ -6,28 +6,14 @@
|
||||
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
|
||||
//! logging what happens when a sequential scan is requested on a small table, then picking out two
|
||||
//! suitable from logs.
|
||||
//!
|
||||
//!
|
||||
//! Reference data (git blame to see commit) on an i3en.3xlarge
|
||||
// ```text
|
||||
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
|
||||
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
|
||||
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
|
||||
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
|
||||
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
|
||||
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
|
||||
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
|
||||
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
|
||||
//! ``
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, Barrier};
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
@@ -53,11 +39,11 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
rt.block_on(short().execute(&manager)).unwrap();
|
||||
short().execute(rt.handle(), &manager).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
let thread_counts = [1, 2, 4, 8, 16];
|
||||
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
@@ -88,69 +74,114 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
drop(group);
|
||||
}
|
||||
|
||||
/// Sets up a multi-threaded tokio runtime with default worker thread count,
|
||||
/// then, spawn `requesters` tasks that repeatedly:
|
||||
/// - get input from `input_factor()`
|
||||
/// - call `manager.request_redo()` with their input
|
||||
///
|
||||
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
|
||||
///
|
||||
/// Using tokio's default worker thread count means the results will differ on machines
|
||||
/// with different core countrs. We don't care about that, the performance will always
|
||||
/// be different on different hardware. To compare performance of different software versions,
|
||||
/// use the same hardware.
|
||||
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
|
||||
fn add_multithreaded_walredo_requesters(
|
||||
b: &mut criterion::Bencher,
|
||||
nrequesters: usize,
|
||||
threads: u32,
|
||||
manager: &Arc<PostgresRedoManager>,
|
||||
input_factory: fn() -> Request,
|
||||
) {
|
||||
assert_ne!(nrequesters, 0);
|
||||
assert_ne!(threads, 0);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
if threads == 1 {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
b.iter_batched_ref(
|
||||
|| Some(input_factory()),
|
||||
|input| execute_all(input.take(), handle, manager),
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
} else {
|
||||
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
|
||||
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
|
||||
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
|
||||
|
||||
let mut requesters = JoinSet::new();
|
||||
for _ in 0..nrequesters {
|
||||
let _entered = rt.enter();
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
requesters.spawn(async move {
|
||||
loop {
|
||||
let input = input_factory();
|
||||
barrier.wait().await;
|
||||
let page = input.execute(&manager).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
barrier.wait().await;
|
||||
}
|
||||
});
|
||||
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
||||
|
||||
let jhs = (0..threads)
|
||||
.map(|_| {
|
||||
std::thread::spawn({
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
let work_rx = work_rx.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, handle, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let _jhs = JoinOnDrop(jhs);
|
||||
|
||||
b.iter_batched(
|
||||
|| {
|
||||
for _ in 0..threads {
|
||||
work_tx.send(()).unwrap()
|
||||
}
|
||||
},
|
||||
|()| {
|
||||
// start the work
|
||||
barrier.wait();
|
||||
|
||||
// wait for work to complete
|
||||
barrier.wait();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
drop(work_tx);
|
||||
}
|
||||
}
|
||||
|
||||
let do_one_iteration = || {
|
||||
rt.block_on(async {
|
||||
barrier.wait().await;
|
||||
// wait for work to complete
|
||||
barrier.wait().await;
|
||||
})
|
||||
};
|
||||
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
|
||||
|
||||
b.iter_batched(
|
||||
|| {
|
||||
// warmup
|
||||
do_one_iteration();
|
||||
},
|
||||
|()| {
|
||||
// work loop
|
||||
do_one_iteration();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
impl Drop for JoinOnDrop {
|
||||
// it's not really needless because we want join all then check for panicks
|
||||
#[allow(clippy::needless_collect)]
|
||||
fn drop(&mut self) {
|
||||
// first join all
|
||||
let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
|
||||
// then check the results; panicking here is not great, but it does get the message across
|
||||
// to the user, and sets an exit value.
|
||||
results.into_iter().try_for_each(|res| res).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
rt.block_on(requesters.shutdown());
|
||||
fn execute_all<I>(
|
||||
input: I,
|
||||
handle: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = Request>,
|
||||
{
|
||||
// just fire all requests as fast as possible
|
||||
input.into_iter().try_for_each(|req| {
|
||||
let page = req.execute(handle, manager)?;
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
anyhow::Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
criterion_group!(benches, redo_scenarios);
|
||||
@@ -462,7 +493,11 @@ struct Request {
|
||||
}
|
||||
|
||||
impl Request {
|
||||
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
fn execute(
|
||||
self,
|
||||
rt: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
@@ -471,8 +506,6 @@ impl Request {
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
manager
|
||||
.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,8 +4,7 @@ use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
PagestreamGetPageResponse, PagestreamGetVectoredPagesRequest,
|
||||
PagestreamGetVectoredPagesResponse,
|
||||
PagestreamGetPageResponse,
|
||||
},
|
||||
reltag::RelTag,
|
||||
};
|
||||
@@ -158,39 +157,7 @@ impl PagestreamClient {
|
||||
PagestreamBeMessage::Exists(_)
|
||||
| PagestreamBeMessage::Nblocks(_)
|
||||
| PagestreamBeMessage::DbSize(_)
|
||||
| PagestreamBeMessage::GetSlruSegment(_)
|
||||
| PagestreamBeMessage::GetVectoredPages(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
msg.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn getpages(
|
||||
&mut self,
|
||||
req: PagestreamGetVectoredPagesRequest,
|
||||
) -> anyhow::Result<PagestreamGetVectoredPagesResponse> {
|
||||
let req = PagestreamFeMessage::GetVectoredPages(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
PagestreamBeMessage::GetVectoredPages(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
| PagestreamBeMessage::Nblocks(_)
|
||||
| PagestreamBeMessage::DbSize(_)
|
||||
| PagestreamBeMessage::GetSlruSegment(_)
|
||||
| PagestreamBeMessage::GetPage(_) => {
|
||||
| PagestreamBeMessage::GetSlruSegment(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
msg.kind()
|
||||
|
||||
@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
use tracing::{debug, info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
@@ -28,8 +28,6 @@ pub(crate) struct Args {
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
page_service_connstring: Option<String>,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
@@ -232,17 +230,12 @@ async fn client(
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let connstr = match &args.page_service_connstring {
|
||||
Some(connstr) => connstr.clone(),
|
||||
None => crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
),
|
||||
};
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(connstr)
|
||||
.await
|
||||
.unwrap();
|
||||
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
@@ -270,7 +263,7 @@ async fn client(
|
||||
}
|
||||
})
|
||||
.await;
|
||||
info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::future::join_all;
|
||||
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamGetVectoredPagesRequest};
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::info;
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -36,12 +38,8 @@ pub(crate) struct Args {
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
/// Each client sends requests at the given rate.
|
||||
///
|
||||
/// If a request takes too long and we should be issuing a new request already,
|
||||
/// we skip that request and account it as `MISSED`.
|
||||
#[clap(long)]
|
||||
per_client_rate: Option<usize>,
|
||||
per_target_rate_limit: Option<usize>,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long, default_value = "1")]
|
||||
req_latest_probability: f64,
|
||||
@@ -57,24 +55,18 @@ pub(crate) struct Args {
|
||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||
#[clap(long)]
|
||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||
#[clap(long)]
|
||||
vectored_read_size: Option<u8>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
missed: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn request_done(&self) {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn missed(&self, n: u64) {
|
||||
self.missed.fetch_add(n, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, serde::Serialize, serde::Deserialize)]
|
||||
@@ -228,12 +220,13 @@ async fn main_impl(
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = args.num_clients.get() * timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = args.num_clients.get() * timelines.len();
|
||||
let num_work_sender_tasks = 1;
|
||||
let num_main_impl = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_live_stats_dump + num_work_sender_tasks + num_main_impl,
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
|
||||
));
|
||||
|
||||
tokio::spawn({
|
||||
@@ -245,12 +238,10 @@ async fn main_impl(
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let missed = stats.missed.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0} MISSED: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64(),
|
||||
missed as f64 / elapsed.as_secs_f64()
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -258,128 +249,127 @@ async fn main_impl(
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let rps_period = args
|
||||
.per_client_rate
|
||||
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
|
||||
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
|
||||
let live_stats = live_stats.clone();
|
||||
let start_work_barrier = start_work_barrier.clone();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == worker_id.timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights =
|
||||
rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
|
||||
.unwrap();
|
||||
|
||||
let cancel = cancel.clone();
|
||||
Box::pin(async move {
|
||||
let client =
|
||||
pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
let periods_passed_until_now =
|
||||
usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
|
||||
.unwrap();
|
||||
|
||||
if periods_passed_until_now > ticks_processed {
|
||||
live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
|
||||
}
|
||||
ticks_processed = periods_passed_until_now;
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
if let Some(size) = args.vectored_read_size {
|
||||
assert!(size > 0);
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
|
||||
PagestreamGetVectoredPagesRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
count: size
|
||||
}
|
||||
};
|
||||
client.getpages(req).await.unwrap();
|
||||
} else {
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
client.getpage(req).await.unwrap();
|
||||
}
|
||||
let end = Instant::now();
|
||||
live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
STATS.with(|stats| {
|
||||
stats
|
||||
.borrow()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.observe(end.duration_since(start))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
if let Some(period) = &rps_period {
|
||||
let next_at = client_start
|
||||
+ Duration::from_micros(
|
||||
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
|
||||
);
|
||||
tokio::time::sleep_until(next_at.into()).await;
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
info!("spawning workers");
|
||||
let mut workers = JoinSet::new();
|
||||
let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for timeline in timelines.iter().cloned() {
|
||||
for num_client in 0..args.num_clients.get() {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
|
||||
let worker_id = WorkerId {
|
||||
timeline,
|
||||
num_client,
|
||||
};
|
||||
workers.spawn(make_worker(worker_id));
|
||||
work_senders.insert(worker_id, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
worker_id,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&live_stats),
|
||||
cancel.clone(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
let workers = async move {
|
||||
while let Some(res) = workers.join_next().await {
|
||||
res.unwrap();
|
||||
|
||||
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
|
||||
let start_work_barrier = start_work_barrier.clone();
|
||||
let cancel = cancel.clone();
|
||||
match args.per_target_rate_limit {
|
||||
None => Box::pin(async move {
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
all_ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
let (timeline, req) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(
|
||||
WorkerId {
|
||||
timeline: r.timeline,
|
||||
num_client: rng.gen_range(0..args.num_clients.get()),
|
||||
},
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
if sender.send(req).await.is_err() {
|
||||
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
|
||||
}
|
||||
}
|
||||
}),
|
||||
Some(rps_limit) => Box::pin(async move {
|
||||
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
|
||||
let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
|
||||
&|worker_id| {
|
||||
let sender = work_senders.get(&worker_id).unwrap();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == worker_id.timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let cancel = cancel.clone();
|
||||
Box::pin(async move {
|
||||
let mut ticker = tokio::time::interval(period);
|
||||
ticker.set_missed_tick_behavior(
|
||||
/* TODO review this choice */
|
||||
tokio::time::MissedTickBehavior::Burst,
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
ticker.tick().await;
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
if sender.send(req).await.is_err() {
|
||||
assert!(
|
||||
cancel.is_cancelled(),
|
||||
"client has gone away unexpectedly"
|
||||
);
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
join_all(tasks).await;
|
||||
}),
|
||||
}
|
||||
};
|
||||
|
||||
let work_sender_task = tokio::spawn(work_sender);
|
||||
|
||||
info!("waiting for everything to become ready");
|
||||
start_work_barrier.wait().await;
|
||||
info!("work started");
|
||||
@@ -387,13 +377,20 @@ async fn main_impl(
|
||||
tokio::time::sleep(runtime.into()).await;
|
||||
info!("runtime over, signalling cancellation");
|
||||
cancel.cancel();
|
||||
workers.await;
|
||||
work_sender_task.await.unwrap();
|
||||
info!("work sender exited");
|
||||
} else {
|
||||
workers.await;
|
||||
work_sender_task.await.unwrap();
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
info!("joining clients");
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
info!("all clients stopped");
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
@@ -410,3 +407,49 @@ async fn main_impl(
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
id: WorkerId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
let WorkerId {
|
||||
timeline,
|
||||
num_client: _,
|
||||
} = id;
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let do_requests = async {
|
||||
start_work_barrier.wait().await;
|
||||
while let Some(req) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(req)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
};
|
||||
tokio::select! {
|
||||
res = do_requests => { res },
|
||||
_ = cancel.cancelled() => {
|
||||
// fallthrough to shutdown
|
||||
}
|
||||
}
|
||||
client.shutdown().await;
|
||||
}
|
||||
|
||||
@@ -14,12 +14,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||
(Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Pageserver auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
(Scope::SafekeeperData, _) => Err(AuthError(
|
||||
"SafekeeperData scope makes no sense for Pageserver".into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,7 +143,6 @@ where
|
||||
ar: &'a mut Builder<&'b mut W>,
|
||||
buf: Vec<u8>,
|
||||
current_segment: Option<(SlruKind, u32)>,
|
||||
total_blocks: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
|
||||
@@ -155,7 +154,6 @@ where
|
||||
ar,
|
||||
buf: Vec::new(),
|
||||
current_segment: None,
|
||||
total_blocks: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,8 +199,7 @@ where
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
||||
|
||||
self.total_blocks += nblocks;
|
||||
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
|
||||
self.buf.clear();
|
||||
|
||||
@@ -210,15 +207,11 @@ where
|
||||
}
|
||||
|
||||
async fn finish(mut self) -> anyhow::Result<()> {
|
||||
let res = if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
self.flush().await
|
||||
};
|
||||
if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("Collected {} SLRU blocks", self.total_blocks);
|
||||
|
||||
res
|
||||
self.flush().await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,7 +261,10 @@ where
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(&part.ranges, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
slru_builder.add_block(&key, block?).await?;
|
||||
|
||||
@@ -33,7 +33,6 @@ use utils::{
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
@@ -85,12 +84,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_SIZE: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -128,12 +121,6 @@ pub mod defaults {
|
||||
|
||||
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
|
||||
|
||||
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
|
||||
|
||||
#max_vectored_read_size = '{DEFAULT_MAX_VECTORED_READ_SIZE}'
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -269,12 +256,6 @@ pub struct PageServerConf {
|
||||
pub ingest_batch_size: u64,
|
||||
|
||||
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
||||
|
||||
pub get_vectored_impl: GetVectoredImpl,
|
||||
|
||||
pub max_vectored_read_size: usize,
|
||||
|
||||
pub validate_vectored_get: bool,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -361,12 +342,6 @@ struct PageServerConfigBuilder {
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
|
||||
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
||||
|
||||
get_vectored_impl: BuilderValue<GetVectoredImpl>,
|
||||
|
||||
max_vectored_read_size: BuilderValue<usize>,
|
||||
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -444,10 +419,6 @@ impl Default for PageServerConfigBuilder {
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
|
||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||
|
||||
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
|
||||
max_vectored_read_size: Set(DEFAULT_MAX_VECTORED_READ_SIZE),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -608,18 +579,6 @@ impl PageServerConfigBuilder {
|
||||
self.virtual_file_io_engine = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
|
||||
self.get_vectored_impl = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_max_vectored_read_size(&mut self, value: usize) {
|
||||
self.max_vectored_read_size = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_validate_vectored_get(&mut self, value: bool) {
|
||||
self.validate_vectored_get = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -730,15 +689,6 @@ impl PageServerConfigBuilder {
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
get_vectored_impl: self
|
||||
.get_vectored_impl
|
||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
||||
max_vectored_read_size: self
|
||||
.max_vectored_read_size
|
||||
.ok_or(anyhow!("missing max_vectored_read_size"))?,
|
||||
validate_vectored_get: self
|
||||
.validate_vectored_get
|
||||
.ok_or(anyhow!("missing validate_vectored_get"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -993,15 +943,6 @@ impl PageServerConf {
|
||||
"virtual_file_io_engine" => {
|
||||
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
|
||||
}
|
||||
"get_vectored_impl" => {
|
||||
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
|
||||
}
|
||||
"max_vectored_read_size" => {
|
||||
builder.get_max_vectored_read_size(parse_toml_u64("max_vectored_read_size", item)? as usize)
|
||||
}
|
||||
"validate_vectored_get" => {
|
||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1076,9 +1017,6 @@ impl PageServerConf {
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
max_vectored_read_size: defaults::DEFAULT_MAX_VECTORED_READ_SIZE,
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1312,9 +1250,6 @@ background_task_maximum_delay = '334 s'
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
max_vectored_read_size: defaults::DEFAULT_MAX_VECTORED_READ_SIZE,
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1379,9 +1314,6 @@ background_task_maximum_delay = '334 s'
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
max_vectored_read_size: defaults::DEFAULT_MAX_VECTORED_READ_SIZE,
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -83,12 +83,12 @@ use utils::{
|
||||
// This is not functionally necessary (clients will retry), but avoids generating a lot of
|
||||
// failed API calls while tenants are activating.
|
||||
#[cfg(not(feature = "testing"))]
|
||||
pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
|
||||
// finish attaching, if calls to remote storage are slow.
|
||||
#[cfg(feature = "testing")]
|
||||
pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -571,16 +571,10 @@ async fn timeline_list_handler(
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
@@ -622,7 +616,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
// location where timeline recreation cand find it.
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -1142,7 +1136,7 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
||||
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
|
||||
@@ -4,8 +4,8 @@ use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
|
||||
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -1266,12 +1266,13 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"pageserver_remote_timeline_client_calls_started",
|
||||
"Number of started calls to remote timeline client.",
|
||||
"pageserver_remote_timeline_client_calls_finished",
|
||||
"Number of finshed calls to remote timeline client.",
|
||||
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
"Number of ongoing calls to remote timeline client. \
|
||||
Used to populate pageserver_remote_timeline_client_calls_started. \
|
||||
This metric is not useful for sampling from Prometheus, but useful in tests.",
|
||||
&[
|
||||
"tenant_id",
|
||||
"shard_id",
|
||||
@@ -1280,7 +1281,23 @@ static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
"op_kind"
|
||||
],
|
||||
)
|
||||
.unwrap()
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_timeline_client_calls_started",
|
||||
"When calling a remote timeline client method, we record the current value \
|
||||
of the calls_unfinished gauge in this histogram. Plot the histogram \
|
||||
over time in a heatmap to visualize how many operations were ongoing \
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
|
||||
@@ -2061,7 +2078,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
@@ -2072,7 +2089,7 @@ impl RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||
shard_id: format!("{}", tenant_shard_id.shard_slug()),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
calls: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
@@ -2112,15 +2129,15 @@ impl RemoteTimelineClientMetrics {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn calls_counter_pair(
|
||||
fn calls_unfinished_gauge(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntCounterPair {
|
||||
let mut guard = self.calls.lock().unwrap();
|
||||
) -> IntGauge {
|
||||
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
@@ -2133,6 +2150,17 @@ impl RemoteTimelineClientMetrics {
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn calls_started_hist(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[key.0, key.1])
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
@@ -2203,7 +2231,7 @@ impl RemoteTimelineClientMetrics {
|
||||
#[must_use]
|
||||
pub(crate) struct RemoteTimelineClientCallMetricGuard {
|
||||
/// Decremented on drop.
|
||||
calls_counter_pair: Option<IntCounterPair>,
|
||||
calls_unfinished_metric: Option<IntGauge>,
|
||||
/// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
|
||||
bytes_finished: Option<(IntCounter, u64)>,
|
||||
}
|
||||
@@ -2213,10 +2241,10 @@ impl RemoteTimelineClientCallMetricGuard {
|
||||
/// The caller vouches to do the metric updates manually.
|
||||
pub fn will_decrement_manually(mut self) {
|
||||
let RemoteTimelineClientCallMetricGuard {
|
||||
calls_counter_pair,
|
||||
calls_unfinished_metric,
|
||||
bytes_finished,
|
||||
} = &mut self;
|
||||
calls_counter_pair.take();
|
||||
calls_unfinished_metric.take();
|
||||
bytes_finished.take();
|
||||
}
|
||||
}
|
||||
@@ -2224,10 +2252,10 @@ impl RemoteTimelineClientCallMetricGuard {
|
||||
impl Drop for RemoteTimelineClientCallMetricGuard {
|
||||
fn drop(&mut self) {
|
||||
let RemoteTimelineClientCallMetricGuard {
|
||||
calls_counter_pair,
|
||||
calls_unfinished_metric,
|
||||
bytes_finished,
|
||||
} = self;
|
||||
if let Some(guard) = calls_counter_pair.take() {
|
||||
if let Some(guard) = calls_unfinished_metric.take() {
|
||||
guard.dec();
|
||||
}
|
||||
if let Some((bytes_finished_metric, value)) = bytes_finished {
|
||||
@@ -2260,8 +2288,10 @@ impl RemoteTimelineClientMetrics {
|
||||
op_kind: &RemoteOpKind,
|
||||
size: RemoteTimelineClientMetricsCallTrackSize,
|
||||
) -> RemoteTimelineClientCallMetricGuard {
|
||||
let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
|
||||
calls_counter_pair.inc();
|
||||
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
self.calls_started_hist(file_kind, op_kind)
|
||||
.observe(calls_unfinished_metric.get() as f64);
|
||||
calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
|
||||
|
||||
let bytes_finished = match size {
|
||||
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
|
||||
@@ -2275,7 +2305,7 @@ impl RemoteTimelineClientMetrics {
|
||||
}
|
||||
};
|
||||
RemoteTimelineClientCallMetricGuard {
|
||||
calls_counter_pair: Some(calls_counter_pair),
|
||||
calls_unfinished_metric: Some(calls_unfinished_metric),
|
||||
bytes_finished,
|
||||
}
|
||||
}
|
||||
@@ -2289,8 +2319,12 @@ impl RemoteTimelineClientMetrics {
|
||||
op_kind: &RemoteOpKind,
|
||||
size: RemoteTimelineClientMetricsCallTrackSize,
|
||||
) {
|
||||
let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
|
||||
calls_counter_pair.dec();
|
||||
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
debug_assert!(
|
||||
calls_unfinished_metric.get() > 0,
|
||||
"begin and end should cancel out"
|
||||
);
|
||||
calls_unfinished_metric.dec();
|
||||
match size {
|
||||
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
|
||||
@@ -2307,15 +2341,18 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
calls,
|
||||
calls_unfinished_gauge,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b), _) in calls.get_mut().unwrap().drain() {
|
||||
let mut res = [Ok(()), Ok(())];
|
||||
REMOTE_TIMELINE_CLIENT_CALLS
|
||||
.remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
|
||||
// don't care about results
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
@@ -2459,56 +2496,6 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) mod tenant_throttling {
|
||||
use metrics::{register_int_counter_vec, IntCounter};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use crate::tenant::{self, throttle::Metric};
|
||||
|
||||
pub(crate) struct TimelineGet {
|
||||
wait_time: IntCounter,
|
||||
count: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
|
||||
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_wait_usecs_sum_global",
|
||||
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_global",
|
||||
"Count of tenant throttlings, by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let kind = "timeline_get";
|
||||
TimelineGet {
|
||||
wait_time: WAIT_USECS.with_label_values(&[kind]),
|
||||
count: WAIT_COUNT.with_label_values(&[kind]),
|
||||
}
|
||||
});
|
||||
|
||||
impl Metric for &'static TimelineGet {
|
||||
#[inline(always)]
|
||||
fn observe_throttling(
|
||||
&self,
|
||||
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
|
||||
) {
|
||||
let val = u64::try_from(wait_time.as_micros()).unwrap();
|
||||
self.wait_time.inc_by(val);
|
||||
self.count.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
@@ -2570,5 +2557,4 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// Custom
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&tenant_throttling::TIMELINE_GET);
|
||||
}
|
||||
|
||||
@@ -17,8 +17,6 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::Stream;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::PagestreamGetVectoredPagesRequest;
|
||||
use pageserver_api::models::PagestreamGetVectoredPagesResponse;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
@@ -28,7 +26,7 @@ use pageserver_api::models::{
|
||||
PagestreamNblocksResponse,
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -73,7 +71,6 @@ use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::timeline::WaitLsnError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -337,10 +334,6 @@ enum PageStreamError {
|
||||
#[error("Read error")]
|
||||
Read(#[source] PageReconstructError),
|
||||
|
||||
/// Something went wrong reading a page: this likely indicates a pageserver bug
|
||||
#[error("Vectored read error")]
|
||||
VectoredRead(#[source] GetVectoredError),
|
||||
|
||||
/// Ran out of time waiting for an LSN
|
||||
#[error("LSN timeout: {0}")]
|
||||
LsnTimeout(WaitLsnError),
|
||||
@@ -364,15 +357,6 @@ impl From<PageReconstructError> for PageStreamError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetVectoredError> for PageStreamError {
|
||||
fn from(value: GetVectoredError) -> Self {
|
||||
match value {
|
||||
GetVectoredError::Cancelled => Self::Shutdown,
|
||||
e => Self::VectoredRead(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetActiveTimelineError> for PageStreamError {
|
||||
fn from(value: GetActiveTimelineError) -> Self {
|
||||
match value {
|
||||
@@ -682,15 +666,6 @@ impl PageServerHandler {
|
||||
span,
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetVectoredPages(req) => {
|
||||
let span = tracing::info_span!("handle_get_vectored_pages_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn, req_count = %req.count);
|
||||
(
|
||||
self.handle_get_pages_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
span,
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
match response {
|
||||
@@ -1023,7 +998,7 @@ impl PageServerHandler {
|
||||
) -> Result<&Arc<Timeline>, Key> {
|
||||
let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
|
||||
// Fastest path: single sharded case
|
||||
if first_idx.shard_count.count() == 1 {
|
||||
if first_idx.shard_count < ShardCount(2) {
|
||||
return Ok(&first_timeline.timeline);
|
||||
}
|
||||
|
||||
@@ -1186,80 +1161,6 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_get_pages_at_lsn_request(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamGetVectoredPagesRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
// This is cheeky and relies on not using sharding :)
|
||||
// A real solution has to split the requested key sequence between shards.
|
||||
let get_page_request = PagestreamGetPageRequest {
|
||||
latest: req.latest,
|
||||
lsn: req.lsn,
|
||||
rel: req.rel,
|
||||
blkno: req.blkno,
|
||||
};
|
||||
|
||||
let timeline = match self.get_cached_timeline_for_page(&get_page_request) {
|
||||
Ok(tl) => tl,
|
||||
Err(key) => {
|
||||
match self
|
||||
.load_timeline_for_page(tenant_id, timeline_id, key)
|
||||
.await
|
||||
{
|
||||
Ok(t) => t,
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return Err(PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into(),
|
||||
));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
|
||||
set_tracing_field_shard_id(timeline);
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let (page_count, pages_buf) = timeline
|
||||
.get_rel_pages_at_lsn(
|
||||
req.rel,
|
||||
req.blkno,
|
||||
req.count,
|
||||
Version::Lsn(lsn),
|
||||
req.latest,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetVectoredPages(
|
||||
PagestreamGetVectoredPagesResponse {
|
||||
page_count,
|
||||
pages: pages_buf,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_get_slru_segment_request(
|
||||
&mut self,
|
||||
|
||||
@@ -11,12 +11,10 @@ use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::repository::*;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -28,7 +26,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
@@ -199,41 +197,6 @@ impl Timeline {
|
||||
version.get(self, key, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn get_rel_pages_at_lsn(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
count: u8,
|
||||
version: Version<'_>,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(u8, Bytes), GetVectoredError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(GetVectoredError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
));
|
||||
}
|
||||
|
||||
let nblocks = self
|
||||
.get_rel_size(tag, version, latest, ctx)
|
||||
.await
|
||||
.map_err(|e| GetVectoredError::Other(anyhow!(e)))?;
|
||||
if blknum + (count - 1) as u32 >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag,
|
||||
blknum,
|
||||
version.get_lsn(),
|
||||
nblocks
|
||||
);
|
||||
return Ok((1, ZERO_PAGE.clone()));
|
||||
}
|
||||
|
||||
let start_key = rel_block_to_key(tag, blknum);
|
||||
let end_key = start_key.add(count as u32);
|
||||
version.get_vectored(self, start_key..end_key, ctx).await
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
pub(crate) async fn get_db_size(
|
||||
&self,
|
||||
@@ -1529,7 +1492,7 @@ impl<'a> DatadirModification<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut writer = self.tline.writer().await;
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
@@ -1568,23 +1531,13 @@ impl<'a> DatadirModification<'a> {
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
let prev_pending_updates = std::mem::take(&mut self.pending_updates);
|
||||
|
||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
|
||||
.into_iter()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
|
||||
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
|
||||
.collect();
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
}
|
||||
|
||||
@@ -1645,55 +1598,6 @@ impl<'a> DatadirModification<'a> {
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
async fn get_vectored(
|
||||
&self,
|
||||
key_range: Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// version in that case.
|
||||
//
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
|
||||
let mut keys_in_modification = KeySpaceAccum::new();
|
||||
|
||||
let key = key_range.start;
|
||||
while key != key_range.end {
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, value)) = values.last() {
|
||||
keys_in_modification.add_key(key);
|
||||
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
results.insert(key, Ok(img.clone()));
|
||||
}
|
||||
_ => {
|
||||
results.insert(
|
||||
key,
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
))),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
|
||||
let mut keyspace = KeySpace {
|
||||
ranges: vec![key_range],
|
||||
};
|
||||
keyspace.remove_overlapping_with(&keys_in_modification.to_keyspace());
|
||||
|
||||
let pages = self.tline.get_vectored(keyspace, lsn, ctx).await?;
|
||||
results.extend(pages.into_iter());
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
@@ -1737,43 +1641,6 @@ impl<'a> Version<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_vectored(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
key_range: Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(u8, Bytes), GetVectoredError> {
|
||||
let pages = match self {
|
||||
Version::Lsn(lsn) => {
|
||||
timeline
|
||||
.get_vectored(
|
||||
KeySpace {
|
||||
ranges: vec![key_range],
|
||||
},
|
||||
*lsn,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Version::Modified(modification) => modification.get_vectored(key_range, ctx).await,
|
||||
}?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
let page_count: u8 = pages.len().try_into().expect("too many pages returned");
|
||||
for page in pages {
|
||||
match page {
|
||||
(_key, Ok(bytes)) => {
|
||||
buf.extend_from_slice(&bytes[..]);
|
||||
}
|
||||
(_key, Err(err)) => {
|
||||
return Err(GetVectoredError::Other(anyhow!(err)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((page_count, buf.freeze()))
|
||||
}
|
||||
|
||||
fn get_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
Version::Lsn(lsn) => *lsn,
|
||||
|
||||
@@ -188,7 +188,6 @@ task_local! {
|
||||
serde::Serialize,
|
||||
serde::Deserialize,
|
||||
strum_macros::IntoStaticStr,
|
||||
strum_macros::EnumString,
|
||||
)]
|
||||
pub enum TaskKind {
|
||||
// Pageserver startup, i.e., `main`
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -9,8 +9,8 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -251,7 +251,7 @@ impl LocationConf {
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(conf.shard_number),
|
||||
ShardCount::new(conf.shard_count),
|
||||
ShardCount(conf.shard_count),
|
||||
ShardStripeSize(conf.shard_stripe_size),
|
||||
)?
|
||||
};
|
||||
@@ -285,7 +285,7 @@ impl Default for LocationConf {
|
||||
///
|
||||
/// For storing and transmitting individual tenant's configuration, see
|
||||
/// TenantConfOpt.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TenantConf {
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
@@ -348,13 +348,11 @@ pub struct TenantConf {
|
||||
|
||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
/// which parameters are set and which are not.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
@@ -439,9 +437,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -490,10 +485,6 @@ impl TenantConfOpt {
|
||||
lazy_slru_download: self
|
||||
.lazy_slru_download
|
||||
.unwrap_or(global_conf.lazy_slru_download),
|
||||
timeline_get_throttle: self
|
||||
.timeline_get_throttle
|
||||
.clone()
|
||||
.unwrap_or(global_conf.timeline_get_throttle),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -533,7 +524,6 @@ impl Default for TenantConf {
|
||||
gc_feedback: false,
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -606,7 +596,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
gc_feedback: value.gc_feedback,
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -246,8 +246,6 @@ async fn cleanup_remaining_fs_traces(
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
|
||||
|
||||
rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
|
||||
@@ -52,7 +52,8 @@ use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
@@ -146,28 +147,43 @@ impl Drop for BatchedUpdates<'_> {
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
#[derive(Eq, PartialEq, Debug, Hash)]
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<PersistentLayerDesc>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
/// Return value of [`LayerMap::range_search`]
|
||||
///
|
||||
/// Contains a mapping from a layer description to a keyspace
|
||||
/// accumulator that contains all the keys which intersect the layer
|
||||
/// from the original search space. Keys that were not found are accumulated
|
||||
/// in a separate key space accumulator.
|
||||
#[derive(Debug)]
|
||||
pub struct OrderedSearchResult(SearchResult);
|
||||
|
||||
impl Ord for OrderedSearchResult {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.0.lsn_floor.cmp(&other.0.lsn_floor)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for OrderedSearchResult {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for OrderedSearchResult {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.lsn_floor == other.0.lsn_floor
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for OrderedSearchResult {}
|
||||
|
||||
pub struct RangeSearchResult {
|
||||
pub found: HashMap<SearchResult, KeySpaceAccum>,
|
||||
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
|
||||
pub not_found: KeySpaceAccum,
|
||||
}
|
||||
|
||||
impl RangeSearchResult {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
found: HashMap::new(),
|
||||
found: BTreeMap::new(),
|
||||
not_found: KeySpaceAccum::new(),
|
||||
}
|
||||
}
|
||||
@@ -298,7 +314,7 @@ where
|
||||
Some(search_result) => self
|
||||
.result
|
||||
.found
|
||||
.entry(search_result)
|
||||
.entry(OrderedSearchResult(search_result))
|
||||
.or_default()
|
||||
.add_range(covered_range),
|
||||
None => self.pad_range(covered_range),
|
||||
@@ -346,35 +362,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub enum InMemoryLayerHandle {
|
||||
Open {
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
Frozen {
|
||||
idx: usize,
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
impl InMemoryLayerHandle {
|
||||
pub fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
|
||||
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_end_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
|
||||
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -569,43 +556,6 @@ impl LayerMap {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
/// Get a handle for the first in memory layer that matches the provided predicate.
|
||||
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
|
||||
///
|
||||
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
|
||||
/// the same exclusive region established by holding the layer manager lock.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
|
||||
where
|
||||
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
||||
{
|
||||
if let Some(open) = &self.open_layer {
|
||||
if pred(open) {
|
||||
return Some(InMemoryLayerHandle::Open {
|
||||
lsn_floor: open.get_lsn_range().start,
|
||||
end_lsn: open.get_lsn_range().end,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.frozen_layers.iter().rev().position(pred);
|
||||
pos.map(|rev_idx| {
|
||||
let idx = self.frozen_layers.len() - 1 - rev_idx;
|
||||
InMemoryLayerHandle::Frozen {
|
||||
idx,
|
||||
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
|
||||
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the layer pointed to by the provided handle.
|
||||
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
|
||||
match handle {
|
||||
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
|
||||
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range at the specified lsn (inclusive).
|
||||
@@ -919,8 +869,6 @@ impl LayerMap {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -947,15 +895,15 @@ mod tests {
|
||||
|
||||
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
|
||||
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
|
||||
let lhs: HashMap<SearchResult, KeySpace> = lhs
|
||||
let lhs: Vec<_> = lhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result, accum.to_keyspace()))
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.collect();
|
||||
let rhs: HashMap<SearchResult, KeySpace> = rhs
|
||||
let rhs: Vec<_> = rhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result, accum.to_keyspace()))
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.collect();
|
||||
|
||||
assert_eq!(lhs, rhs);
|
||||
@@ -975,7 +923,7 @@ mod tests {
|
||||
Some(res) => {
|
||||
range_search_result
|
||||
.found
|
||||
.entry(res)
|
||||
.entry(OrderedSearchResult(res))
|
||||
.or_default()
|
||||
.add_key(key);
|
||||
}
|
||||
|
||||
@@ -294,6 +294,17 @@ pub enum LoadMetadataError {
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -32,7 +32,6 @@ use crate::control_plane_client::{
|
||||
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
@@ -485,7 +484,7 @@ pub async fn init_tenant_mgr(
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf.clone(),
|
||||
location_conf.tenant_conf,
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
@@ -795,7 +794,7 @@ pub(crate) async fn set_new_tenant_config(
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
if tenant.tenant_shard_id().shard_count > ShardCount(0) {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
@@ -806,7 +805,7 @@ pub(crate) async fn set_new_tenant_config(
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
new_tenant_conf,
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
@@ -1377,7 +1376,7 @@ impl TenantManager {
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -1387,10 +1386,11 @@ impl TenantManager {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
|
||||
if new_shard_count <= ShardCount(effective_old_shard_count) {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count();
|
||||
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
|
||||
if !expansion_factor.is_power_of_two() {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
@@ -1467,7 +1467,7 @@ impl TenantManager {
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
shard: child_shard_identity,
|
||||
tenant_conf: parent_tenant_conf.clone(),
|
||||
tenant_conf: parent_tenant_conf,
|
||||
};
|
||||
|
||||
self.upsert_location(
|
||||
@@ -1490,16 +1490,6 @@ impl TenantManager {
|
||||
peek_slot.and_then(|s| s.get_attached()).cloned()
|
||||
};
|
||||
if let Some(t) = child_shard {
|
||||
// Wait for the child shard to become active: this should be very quick because it only
|
||||
// has to download the index_part that we just uploaded when creating it.
|
||||
if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
|
||||
// This is not fatal: we have durably created the child shard. It just makes the
|
||||
// split operation less seamless for clients, as we will may detach the parent
|
||||
// shard before the child shards are fully ready to serve requests.
|
||||
tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
|
||||
continue;
|
||||
}
|
||||
|
||||
let timelines = t.timelines.lock().unwrap().clone();
|
||||
for timeline in timelines.values() {
|
||||
let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
|
||||
|
||||
@@ -614,7 +614,7 @@ impl RemoteTimelineClient {
|
||||
metadata,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.metric_begin(&op);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
|
||||
|
||||
@@ -654,7 +654,7 @@ impl RemoteTimelineClient {
|
||||
metadata.generation, metadata.shard
|
||||
);
|
||||
let op = UploadOp::UploadLayer(layer, metadata);
|
||||
self.metric_begin(&op);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
@@ -823,14 +823,10 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
if with_metadata.is_empty() {
|
||||
// avoid scheduling the op & bumping the metric
|
||||
return;
|
||||
}
|
||||
let op = UploadOp::Delete(Delete {
|
||||
layers: with_metadata,
|
||||
});
|
||||
self.metric_begin(&op);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
@@ -1520,10 +1516,10 @@ impl RemoteTimelineClient {
|
||||
.await;
|
||||
}
|
||||
|
||||
self.metric_end(&task.op);
|
||||
self.calls_unfinished_metric_end(&task.op);
|
||||
}
|
||||
|
||||
fn metric_impl(
|
||||
fn calls_unfinished_metric_impl(
|
||||
&self,
|
||||
op: &UploadOp,
|
||||
) -> Option<(
|
||||
@@ -1560,17 +1556,17 @@ impl RemoteTimelineClient {
|
||||
Some(res)
|
||||
}
|
||||
|
||||
fn metric_begin(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
|
||||
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
|
||||
Some(x) => x,
|
||||
None => return,
|
||||
};
|
||||
let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
|
||||
guard.will_decrement_manually(); // in metric_end(), see right below
|
||||
guard.will_decrement_manually(); // in unfinished_ops_metric_end()
|
||||
}
|
||||
|
||||
fn metric_end(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
|
||||
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
|
||||
Some(x) => x,
|
||||
None => return,
|
||||
};
|
||||
@@ -1655,7 +1651,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Tear down queued ops
|
||||
for op in qi.queued_operations.into_iter() {
|
||||
self.metric_end(&op);
|
||||
self.calls_unfinished_metric_end(&op);
|
||||
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
|
||||
// which is exactly what we want to happen.
|
||||
drop(op);
|
||||
|
||||
@@ -81,7 +81,15 @@ pub async fn download_layer_file<'a>(
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
let download = storage
|
||||
.download(&remote_path, cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut destination_file =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
@@ -90,11 +98,9 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
.with_context(|| format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
))
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
match bytes_amount {
|
||||
|
||||
@@ -133,7 +133,7 @@ impl SecondaryTenant {
|
||||
}
|
||||
|
||||
pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
|
||||
*(self.tenant_conf.lock().unwrap()) = config.clone();
|
||||
*(self.tenant_conf.lock().unwrap()) = *config;
|
||||
}
|
||||
|
||||
/// For API access: generate a LocationConfig equivalent to the one that would be used to
|
||||
@@ -144,13 +144,13 @@ impl SecondaryTenant {
|
||||
|
||||
let conf = models::LocationConfigSecondary { warm: conf.warm };
|
||||
|
||||
let tenant_conf = self.tenant_conf.lock().unwrap().clone();
|
||||
let tenant_conf = *self.tenant_conf.lock().unwrap();
|
||||
models::LocationConfig {
|
||||
mode: models::LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(conf),
|
||||
shard_number: self.tenant_shard_id.shard_number.0,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal(),
|
||||
shard_count: self.tenant_shard_id.shard_count.0,
|
||||
shard_stripe_size: self.shard_identity.stripe_size.0,
|
||||
tenant_conf: tenant_conf.into(),
|
||||
}
|
||||
|
||||
@@ -8,21 +8,15 @@ pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::models::{
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
@@ -40,11 +34,6 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use super::layer_map::InMemoryLayerHandle;
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::PageReconstructError;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -78,287 +67,6 @@ pub struct ValueReconstructState {
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
|
||||
pub(crate) enum ValueReconstructSituation {
|
||||
Complete,
|
||||
#[default]
|
||||
Continue,
|
||||
}
|
||||
|
||||
/// Reconstruct data accumulated for a single key during a vectored get
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub(crate) struct VectoredValueReconstructState {
|
||||
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub(crate) img: Option<(Lsn, Bytes)>,
|
||||
|
||||
situation: ValueReconstructSituation,
|
||||
}
|
||||
|
||||
impl VectoredValueReconstructState {
|
||||
fn get_cached_lsn(&self) -> Option<Lsn> {
|
||||
self.img.as_ref().map(|img| img.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
fn from(mut state: VectoredValueReconstructState) -> Self {
|
||||
// walredo expects the records to be descending in terms of Lsn
|
||||
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
|
||||
|
||||
ValueReconstructState {
|
||||
records: state.records,
|
||||
img: state.img,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Associate a key with the error which it encountered and mark it as done
|
||||
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
|
||||
let previous = self.keys.insert(key, Err(err));
|
||||
if let Some(Ok(state)) = previous {
|
||||
if state.situation == ValueReconstructSituation::Continue {
|
||||
self.keys_done.add_key(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
/// If the key is done after the update, mark it as such.
|
||||
pub(crate) fn update_key(
|
||||
&mut self,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
value: Value,
|
||||
) -> ValueReconstructSituation {
|
||||
let state = self
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => unreachable!(),
|
||||
ValueReconstructSituation::Continue => match value {
|
||||
Value::Image(img) => {
|
||||
state.img = Some((lsn, img));
|
||||
true
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let reached_cache =
|
||||
state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
|
||||
let will_init = rec.will_init();
|
||||
state.records.push((lsn, rec));
|
||||
will_init || reached_cache
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
if key_done && state.situation == ValueReconstructSituation::Continue {
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
|
||||
state.situation
|
||||
} else {
|
||||
ValueReconstructSituation::Complete
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the Lsn at which this key is cached if one exists.
|
||||
/// The read path should go no further than this Lsn for the given key.
|
||||
pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
|
||||
self.keys
|
||||
.get(key)
|
||||
.and_then(|k| k.as_ref().ok())
|
||||
.and_then(|state| state.get_cached_lsn())
|
||||
}
|
||||
|
||||
/// Returns the key space describing the keys that have
|
||||
/// been marked as completed since the last call to this function.
|
||||
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
|
||||
self.keys_done.consume_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ValuesReconstructState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Description of layer to be read - the layer map can turn
|
||||
/// this description into the actual layer.
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub(crate) enum ReadableLayerDesc {
|
||||
Persistent {
|
||||
desc: PersistentLayerDesc,
|
||||
lsn_floor: Lsn,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
InMemory {
|
||||
handle: InMemoryLayerHandle,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
|
||||
#[derive(Debug)]
|
||||
struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
/// read path. The fringe is the set of layers which intersects
|
||||
/// the current keyspace that the search is descending on.
|
||||
/// Each layer tracks the keyspace that intersects it.
|
||||
///
|
||||
/// The fringe must appear sorted by Lsn. Hence, it uses
|
||||
/// a two layer indexing scheme.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LayerFringe {
|
||||
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
|
||||
layers: HashMap<ReadableLayerDesc, KeySpace>,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
layers_by_lsn: BinaryHeap::new(),
|
||||
layers: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
|
||||
let handle = match self.layers_by_lsn.pop() {
|
||||
Some(h) => h,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&handle.0);
|
||||
match removed {
|
||||
Some((layer, keyspace)) => Some((layer, keyspace)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
|
||||
let entry = self.layers.entry(layer.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().merge(&keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.layers_by_lsn
|
||||
.push(ReadableLayerDescOrdered(entry.key().clone()));
|
||||
entry.insert(keyspace);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LayerFringe {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ReadableLayerDescOrdered {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
|
||||
if ord == std::cmp::Ordering::Equal {
|
||||
self.0
|
||||
.get_lsn_floor()
|
||||
.cmp(&other.0.get_lsn_floor())
|
||||
.reverse()
|
||||
} else {
|
||||
ord
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ReadableLayerDescOrdered {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ReadableLayerDescOrdered {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.get_lsn_floor() == other.0.get_lsn_floor()
|
||||
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ReadableLayerDescOrdered {}
|
||||
|
||||
impl ReadableLayerDesc {
|
||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
|
||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
|
||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
layer_manager: &LayerManager,
|
||||
keyspace: KeySpace,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent {
|
||||
desc,
|
||||
lsn_floor,
|
||||
lsn_ceil,
|
||||
} => {
|
||||
let layer = layer_manager.get_from_desc(desc);
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
*lsn_floor,
|
||||
*lsn_ceil,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
||||
let layer = layer_manager
|
||||
.layer_map()
|
||||
.get_in_memory_layer(handle)
|
||||
.unwrap();
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value from [`Layer::get_value_reconstruct_data`]
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
|
||||
@@ -35,18 +35,12 @@ use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
@@ -65,9 +59,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
|
||||
};
|
||||
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -215,7 +207,6 @@ pub struct DeltaLayerInner {
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
vectored_blob_reader: VectoredBlobReader,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader,
|
||||
@@ -251,7 +242,7 @@ impl DeltaLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, 0, ctx).await?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
inner.dump(ctx).await
|
||||
}
|
||||
@@ -287,25 +278,20 @@ impl DeltaLayer {
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
max_vectored_read_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(max_vectored_read_size, ctx))
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
.await
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path()))
|
||||
}
|
||||
|
||||
async fn load_inner(
|
||||
&self,
|
||||
max_vectored_read_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<DeltaLayerInner>> {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, max_vectored_read_size, ctx)
|
||||
let loaded = DeltaLayerInner::load(&path, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
@@ -706,16 +692,15 @@ impl DeltaLayerInner {
|
||||
pub(super) async fn load(
|
||||
path: &Utf8Path,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let block_reader = FileBlockReader::new(file);
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
let summary_blk = match file.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
@@ -737,16 +722,8 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: don't open file twice
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(file, max_vectored_read_size);
|
||||
|
||||
Ok(Ok(DeltaLayerInner {
|
||||
file: block_reader,
|
||||
vectored_blob_reader,
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
}))
|
||||
@@ -841,174 +818,6 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
// If the key is cached, go no further than the cached Lsn.
|
||||
//
|
||||
// Currently, the index is visited for each range, but this
|
||||
// can be further optimised to visit the index only once.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, start_lsn..end_lsn, reconstruct_state, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(self.vectored_blob_reader.get_max_read_size());
|
||||
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
|
||||
tree_reader
|
||||
.visit(
|
||||
&start_key.0,
|
||||
VisitDirection::Forwards,
|
||||
|raw_key, value| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
|
||||
assert!(key >= range.start && lsn >= lsn_range.start);
|
||||
|
||||
let cached_lsn = reconstruct_state.get_cached_lsn(&key);
|
||||
let flag = {
|
||||
if cached_lsn >= Some(lsn) {
|
||||
BlobFlag::Ignore
|
||||
} else if blob_ref.will_init() {
|
||||
BlobFlag::Replaces
|
||||
} else {
|
||||
BlobFlag::None
|
||||
}
|
||||
};
|
||||
|
||||
if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
|
||||
planner.handle_range_end(blob_ref.pos());
|
||||
range_end_handled = true;
|
||||
false
|
||||
} else {
|
||||
planner.handle(key, lsn, blob_ref.pos(), flag);
|
||||
true
|
||||
}
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
|
||||
if !range_end_handled {
|
||||
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
tracing::info!("Handling range end fallback at {}", payload_end);
|
||||
planner.handle_range_end(payload_end);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
) {
|
||||
let mut ignore_key_with_err = None;
|
||||
|
||||
let mut buf = Some(BytesMut::with_capacity(
|
||||
self.vectored_blob_reader.get_max_read_size(),
|
||||
));
|
||||
|
||||
for read in reads.into_iter().rev() {
|
||||
let res = self
|
||||
.vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||
.await;
|
||||
|
||||
let blobs_buf = match res {
|
||||
Ok(blobs_buf) => blobs_buf,
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.vectored_blob_reader.get_file_ref().path,
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(
|
||||
self.vectored_blob_reader.get_max_read_size(),
|
||||
));
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
if Some(meta.meta.key) == ignore_key_with_err {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
|
||||
let value = match value {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to deserialize blob from virtual file {}",
|
||||
self.vectored_blob_reader.get_file_ref().path,
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
|
||||
// state, no further updates shall be made to it. The call below will
|
||||
// panic if the invariant is violated.
|
||||
reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
|
||||
@@ -26,25 +26,20 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
@@ -64,7 +59,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
|
||||
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -157,7 +152,6 @@ pub struct ImageLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader,
|
||||
vectored_blob_reader: VectoredBlobReader,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
@@ -214,7 +208,7 @@ impl ImageLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, 0, ctx).await?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
inner.dump(ctx).await?;
|
||||
|
||||
@@ -244,32 +238,21 @@ impl ImageLayer {
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
max_vectored_read_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(max_vectored_read_size, ctx))
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
.await
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path()))
|
||||
}
|
||||
|
||||
async fn load_inner(
|
||||
&self,
|
||||
max_vectored_read_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ImageLayerInner> {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = ImageLayerInner::load(
|
||||
&path,
|
||||
self.desc.image_layer_lsn(),
|
||||
None,
|
||||
max_vectored_read_size,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
@@ -376,15 +359,14 @@ impl ImageLayerInner {
|
||||
path: &Utf8Path,
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let block_reader = FileBlockReader::new(file);
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = match file.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
@@ -410,19 +392,11 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: don't open file twice
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(file, max_vectored_read_size);
|
||||
|
||||
Ok(Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
file: block_reader,
|
||||
vectored_blob_reader,
|
||||
file,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -464,124 +438,6 @@ impl ImageLayerInner {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(self.vectored_blob_reader.get_max_read_size());
|
||||
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
&search_key,
|
||||
VisitDirection::Forwards,
|
||||
|raw_key, offset| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
false
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
true
|
||||
}
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
|
||||
|
||||
if !range_end_handled {
|
||||
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
planner.handle_range_end(payload_end);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
) {
|
||||
let mut buf = Some(BytesMut::with_capacity(
|
||||
self.vectored_blob_reader.get_max_read_size(),
|
||||
));
|
||||
for read in reads.into_iter().rev() {
|
||||
let res = self
|
||||
.vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
let img_buf = Bytes::copy_from_slice(&blobs_buf.buf[meta.start..meta.end]);
|
||||
reconstruct_state.update_key(
|
||||
&meta.meta.key,
|
||||
self.lsn,
|
||||
Value::Image(img_buf),
|
||||
);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
}
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.vectored_blob_reader.get_file_ref().path,
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(
|
||||
self.vectored_blob_reader.get_max_read_size(),
|
||||
));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
|
||||
@@ -9,15 +9,13 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord;
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
@@ -27,10 +25,7 @@ use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use super::{DeltaLayerWriter, ResidentLayer};
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -207,91 +202,6 @@ impl InMemoryLayer {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
// If the key is cached, go no further than the cached Lsn.
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
#[derive(Eq, PartialEq, Ord, PartialOrd)]
|
||||
struct BlockRead {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
block_offset: u64,
|
||||
}
|
||||
|
||||
let mut planned_block_reads = BinaryHeap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
|
||||
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
|
||||
None => self.start_lsn..end_lsn,
|
||||
};
|
||||
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
planned_block_reads.push(BlockRead {
|
||||
key,
|
||||
lsn: *entry_lsn,
|
||||
block_offset: *pos,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
|
||||
let keyspace_size = keyspace.total_size();
|
||||
|
||||
let mut completed_keys = HashSet::new();
|
||||
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
|
||||
let block_read = planned_block_reads.pop().unwrap();
|
||||
if completed_keys.contains(&block_read.key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let buf = reader.read_blob(block_read.block_offset, &ctx).await;
|
||||
if let Err(e) = buf {
|
||||
reconstruct_state
|
||||
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
|
||||
completed_keys.insert(block_read.key);
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = Value::des(&buf.unwrap());
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
|
||||
completed_keys.insert(block_read.key);
|
||||
continue;
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
completed_keys.insert(block_read.key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
@@ -336,17 +246,32 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
|
||||
pub(crate) async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
|
||||
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn put_values(
|
||||
&self,
|
||||
values: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
for (key, vals) in values {
|
||||
for (lsn, val) in vals {
|
||||
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_value_locked(
|
||||
@@ -354,16 +279,22 @@ impl InMemoryLayer {
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
let off = {
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
buf.clear();
|
||||
val.ser_into(&mut buf)?;
|
||||
locked_inner
|
||||
.file
|
||||
.write_blob(
|
||||
buf,
|
||||
&buf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
@@ -391,12 +322,7 @@ impl InMemoryLayer {
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
"{} >= {}",
|
||||
self.start_lsn,
|
||||
end_lsn
|
||||
);
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
@@ -17,14 +16,13 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
|
||||
|
||||
use super::delta_layer::{self, DeltaEntry};
|
||||
use super::image_layer;
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
|
||||
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
|
||||
use utils::generation::Generation;
|
||||
@@ -264,37 +262,6 @@ impl Layer {
|
||||
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let layer = self
|
||||
.0
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
|
||||
|
||||
self.0
|
||||
.access_stats
|
||||
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
reconstruct_data,
|
||||
&self.0,
|
||||
ctx,
|
||||
)
|
||||
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Download the layer if evicted.
|
||||
///
|
||||
/// Will not error when the layer is already downloaded.
|
||||
@@ -1210,7 +1177,7 @@ pub(crate) enum EvictionError {
|
||||
|
||||
/// Error internal to the [`LayerInner::get_or_maybe_download`]
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DownloadError {
|
||||
enum DownloadError {
|
||||
#[error("timeline has already shutdown")]
|
||||
TimelineShutdown,
|
||||
#[error("no remote storage configured")]
|
||||
@@ -1307,14 +1274,9 @@ impl DownloadedLayer {
|
||||
owner.desc.key_range.clone(),
|
||||
owner.desc.lsn_range.clone(),
|
||||
));
|
||||
delta_layer::DeltaLayerInner::load(
|
||||
&owner.path,
|
||||
summary,
|
||||
owner.conf.max_vectored_read_size,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
} else {
|
||||
let lsn = owner.desc.image_layer_lsn();
|
||||
let summary = Some(image_layer::Summary::expected(
|
||||
@@ -1323,15 +1285,9 @@ impl DownloadedLayer {
|
||||
owner.desc.key_range.clone(),
|
||||
lsn,
|
||||
));
|
||||
image_layer::ImageLayerInner::load(
|
||||
&owner.path,
|
||||
lsn,
|
||||
summary,
|
||||
owner.conf.max_vectored_read_size,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
};
|
||||
|
||||
match res {
|
||||
@@ -1381,29 +1337,6 @@ impl DownloadedLayer {
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
|
||||
Delta(d) => {
|
||||
d.get_values_reconstruct_data(keyspace, start_lsn, end_lsn, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
use LayerKind::*;
|
||||
match self.get(owner, ctx).await? {
|
||||
|
||||
@@ -15,7 +15,7 @@ use utils::id::TenantId;
|
||||
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
||||
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// a unified way to generate layer information like file name.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct PersistentLayerDesc {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
@@ -9,7 +9,6 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -140,8 +139,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
// How many errors we have seen consequtively
|
||||
let mut error_run_count = 0;
|
||||
|
||||
let mut last_throttle_flag_reset_at = Instant::now();
|
||||
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
@@ -206,27 +203,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
walredo_mgr.maybe_quiesce(period * 10);
|
||||
}
|
||||
|
||||
// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
|
||||
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
|
||||
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
let now = Instant::now();
|
||||
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
|
||||
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
|
||||
if count_throttled == 0 {
|
||||
return;
|
||||
}
|
||||
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
warn!(
|
||||
n_seconds=%format_args!("{:.3}",
|
||||
delta.as_secs_f64()),
|
||||
count_accounted,
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
allowed_rps=%format_args!("{allowed_rps:.0}"),
|
||||
"shard was throttled in the last n_seconds")
|
||||
});
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
use std::{
|
||||
str::FromStr,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use enumset::EnumSet;
|
||||
use tracing::error;
|
||||
|
||||
use crate::{context::RequestContext, task_mgr::TaskKind};
|
||||
|
||||
/// Throttle for `async` functions.
|
||||
///
|
||||
/// Runtime reconfigurable.
|
||||
///
|
||||
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
|
||||
///
|
||||
/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
|
||||
pub struct Throttle<M: Metric> {
|
||||
inner: ArcSwap<Inner>,
|
||||
metric: M,
|
||||
/// will be turned into [`Stats::count_accounted`]
|
||||
count_accounted: AtomicU64,
|
||||
/// will be turned into [`Stats::count_throttled`]
|
||||
count_throttled: AtomicU64,
|
||||
/// will be turned into [`Stats::sum_throttled_usecs`]
|
||||
sum_throttled_usecs: AtomicU64,
|
||||
}
|
||||
|
||||
pub struct Inner {
|
||||
task_kinds: EnumSet<TaskKind>,
|
||||
rate_limiter: Arc<leaky_bucket::RateLimiter>,
|
||||
config: Config,
|
||||
}
|
||||
|
||||
pub type Config = pageserver_api::models::ThrottleConfig;
|
||||
|
||||
pub struct Observation {
|
||||
pub wait_time: Duration,
|
||||
}
|
||||
pub trait Metric {
|
||||
fn observe_throttling(&self, observation: &Observation);
|
||||
}
|
||||
|
||||
/// See [`Throttle::reset_stats`].
|
||||
pub struct Stats {
|
||||
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
|
||||
pub count_accounted: u64,
|
||||
// Subset of the `accounted` requests that were actually throttled.
|
||||
// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
|
||||
pub count_throttled: u64,
|
||||
// Sum of microseconds that throttled requests spent waiting for throttling.
|
||||
pub sum_throttled_usecs: u64,
|
||||
}
|
||||
|
||||
impl<M> Throttle<M>
|
||||
where
|
||||
M: Metric,
|
||||
{
|
||||
pub fn new(config: Config, metric: M) -> Self {
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
|
||||
metric,
|
||||
count_accounted: AtomicU64::new(0),
|
||||
count_throttled: AtomicU64::new(0),
|
||||
sum_throttled_usecs: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
fn new_inner(config: Config) -> Inner {
|
||||
let Config {
|
||||
task_kinds,
|
||||
initial,
|
||||
refill_interval,
|
||||
refill_amount,
|
||||
max,
|
||||
fair,
|
||||
} = &config;
|
||||
let task_kinds: EnumSet<TaskKind> = task_kinds
|
||||
.iter()
|
||||
.filter_map(|s| match TaskKind::from_str(s) {
|
||||
Ok(v) => Some(v),
|
||||
Err(e) => {
|
||||
// TODO: avoid this failure mode
|
||||
error!(
|
||||
"cannot parse task kind, ignoring for rate limiting {}",
|
||||
utils::error::report_compact_sources(&e)
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
Inner {
|
||||
task_kinds,
|
||||
rate_limiter: Arc::new(
|
||||
leaky_bucket::RateLimiter::builder()
|
||||
.initial(*initial)
|
||||
.interval(*refill_interval)
|
||||
.refill(refill_amount.get())
|
||||
.max(*max)
|
||||
.fair(*fair)
|
||||
.build(),
|
||||
),
|
||||
config,
|
||||
}
|
||||
}
|
||||
pub fn reconfigure(&self, config: Config) {
|
||||
self.inner.store(Arc::new(Self::new_inner(config)));
|
||||
}
|
||||
|
||||
/// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling.
|
||||
/// This method allows retrieving & resetting that flag.
|
||||
/// Useful for periodic reporting.
|
||||
pub fn reset_stats(&self) -> Stats {
|
||||
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
|
||||
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
|
||||
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
|
||||
Stats {
|
||||
count_accounted,
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`Config::steady_rps`].
|
||||
pub fn steady_rps(&self) -> f64 {
|
||||
self.inner.load().config.steady_rps()
|
||||
}
|
||||
|
||||
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
if !inner.task_kinds.contains(ctx.task_kind()) {
|
||||
return;
|
||||
};
|
||||
let start = std::time::Instant::now();
|
||||
let mut did_throttle = false;
|
||||
let acquire = inner.rate_limiter.acquire(key_count);
|
||||
// turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate
|
||||
let acquire = tokio::task::unconstrained(acquire);
|
||||
let mut acquire = std::pin::pin!(acquire);
|
||||
std::future::poll_fn(|cx| {
|
||||
use std::future::Future;
|
||||
let poll = acquire.as_mut().poll(cx);
|
||||
did_throttle = did_throttle || poll.is_pending();
|
||||
poll
|
||||
})
|
||||
.await;
|
||||
self.count_accounted.fetch_add(1, Ordering::Relaxed);
|
||||
if did_throttle {
|
||||
self.count_throttled.fetch_add(1, Ordering::Relaxed);
|
||||
let now = Instant::now();
|
||||
let wait_time = now - start;
|
||||
self.sum_throttled_usecs
|
||||
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
|
||||
let observation = Observation { wait_time };
|
||||
self.metric.observe_throttling(&observation);
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -419,7 +419,6 @@ impl DeleteTimelineFlow {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client,
|
||||
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
|
||||
},
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
|
||||
@@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
filtered_records = 0;
|
||||
|
||||
//
|
||||
// We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
|
||||
// layer size can become much larger than `checkpoint_distance`.
|
||||
// It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
|
||||
// amount of data to key-value storage. So performing this check only after processing
|
||||
// all WAL records in the chunk, can cause huge L0 layer files.
|
||||
//
|
||||
timeline
|
||||
.check_checkpoint_distance()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to check checkpoint distance for timeline {}",
|
||||
timeline.timeline_id
|
||||
)
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,6 +406,16 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
}
|
||||
|
||||
timeline
|
||||
.check_checkpoint_distance()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to check checkpoint distance for timeline {}",
|
||||
timeline.timeline_id
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let timeline_remote_consistent_lsn = timeline
|
||||
.get_remote_consistent_lsn_visible()
|
||||
|
||||
@@ -1,412 +0,0 @@
|
||||
//!
|
||||
//! Utilities for vectored reading of variable-sized "blobs".
|
||||
//!
|
||||
//! The "blob" api is an abstraction on top of the "block" api,
|
||||
//! with the main difference being that blobs do not have a fixed
|
||||
//! size (each blob is prefixed with 1 or 4 byte length field)
|
||||
//!
|
||||
//! The vectored apis provided in this module allow for planning
|
||||
//! and executing disk IO which covers multiple blobs.
|
||||
//!
|
||||
//! Reads are planned with [`VectoredReadPlanner`] which will coalesce
|
||||
//! adjacent blocks into a single disk IO request and exectuted by
|
||||
//! [`VectoredBlobReader`] which does all the required offset juggling
|
||||
//! and returns a buffer housing all the blobs and a list of offsets.
|
||||
//!
|
||||
//! Note that the vectored blob api does *not* go through the page cache.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::vec_map::VecMap;
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
/// Metadata bundled with the start and end offset of a blob.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct BlobMeta {
|
||||
pub key: Key,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]
|
||||
pub struct VectoredBlob {
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
pub meta: BlobMeta,
|
||||
}
|
||||
|
||||
/// Return type of [`VectoredBlobReader::read_blobs`]
|
||||
pub struct VectoredBlobsBuf {
|
||||
/// Buffer for all blobs in this read
|
||||
pub buf: BytesMut,
|
||||
/// Offsets into the buffer and metadata for all blobs in this read
|
||||
pub blobs: Vec<VectoredBlob>,
|
||||
}
|
||||
|
||||
/// Description of one disk read for multiple blobs.
|
||||
/// Used as the argument form [`VectoredBlobReader::read_blobs`]
|
||||
#[derive(Debug)]
|
||||
pub struct VectoredRead {
|
||||
pub start: u64,
|
||||
pub end: u64,
|
||||
/// Starting offsets and metadata for each blob in this read
|
||||
pub blobs_at: VecMap<u64, BlobMeta>,
|
||||
|
||||
max_read_size: usize,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
enum VectoredReadExtended {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
impl VectoredRead {
|
||||
fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, meta)
|
||||
.expect("First insertion always succeeds");
|
||||
|
||||
Self {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to extend the current read with a new blob if the start
|
||||
/// offset matches with the current end of the vectored read
|
||||
/// and the resuting size is below the max read size
|
||||
fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
let size = (end - start) as usize;
|
||||
if self.end == start && self.size() + size <= self.max_read_size {
|
||||
self.end = end;
|
||||
self.blobs_at
|
||||
.append(start, meta)
|
||||
.expect("LSNs are ordered within vectored reads");
|
||||
|
||||
return VectoredReadExtended::Yes;
|
||||
}
|
||||
|
||||
VectoredReadExtended::No
|
||||
}
|
||||
|
||||
fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum BlobFlag {
|
||||
None,
|
||||
Ignore,
|
||||
Replaces,
|
||||
}
|
||||
|
||||
/// Planner for vectored blob reads.
|
||||
///
|
||||
/// Blob offsets are received via [`VectoredReadPlanner::handle`]
|
||||
/// and coalesced into disk reads.
|
||||
///
|
||||
/// The implementation is very simple:
|
||||
/// * Collect all blob offsets in an ordered structure
|
||||
/// * Iterate over the collected blobs and coalesce them into reads at the end
|
||||
pub struct VectoredReadPlanner {
|
||||
// Track all the blob offsets. Start offsets must be ordered.
|
||||
blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
|
||||
// Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64, BlobFlag)>,
|
||||
|
||||
max_read_size: usize,
|
||||
}
|
||||
|
||||
impl VectoredReadPlanner {
|
||||
pub fn new(max_read_size: usize) -> Self {
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Include a new blob in the read plan.
|
||||
///
|
||||
/// Notes:
|
||||
/// * This function should be called for each blob in the desired *inclusive* range.
|
||||
/// See `DeltaLayerInner::plan_reads` and `ImageLayerInner::plan_reads`.
|
||||
/// * Calls to this function should be for monotonically continuous (key, lsn) tuples.
|
||||
///
|
||||
/// The `flag` argument has two interesting values:
|
||||
/// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
|
||||
/// This is used for WAL records that `will_init`.
|
||||
/// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
|
||||
/// if the blob is cached.
|
||||
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
|
||||
// Implementation note: internally lag behind by one blob such that
|
||||
// we have a start and end offset when initialising [`VectoredRead`]
|
||||
let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
|
||||
None => {
|
||||
self.prev = Some((key, lsn, offset, flag));
|
||||
return;
|
||||
}
|
||||
Some(prev) => prev,
|
||||
};
|
||||
|
||||
self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
|
||||
|
||||
self.prev = Some((key, lsn, offset, flag));
|
||||
}
|
||||
|
||||
pub fn handle_range_end(&mut self, offset: u64) {
|
||||
if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
|
||||
self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
|
||||
}
|
||||
|
||||
self.prev = None;
|
||||
}
|
||||
|
||||
fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
|
||||
match flag {
|
||||
BlobFlag::None => {
|
||||
let blobs_for_key = self.blobs.entry(key).or_default();
|
||||
blobs_for_key.push((lsn, start_offset, end_offset));
|
||||
}
|
||||
BlobFlag::Replaces => {
|
||||
let blobs_for_key = self.blobs.entry(key).or_default();
|
||||
blobs_for_key.clear();
|
||||
blobs_for_key.push((lsn, start_offset, end_offset));
|
||||
}
|
||||
BlobFlag::Ignore => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Vec<VectoredRead> {
|
||||
let mut current_read: Option<VectoredRead> = None;
|
||||
let mut reads = Vec::new();
|
||||
|
||||
for (key, blobs_for_key) in self.blobs {
|
||||
for (lsn, start_offset, end_offset) in blobs_for_key {
|
||||
let extended = match &mut current_read {
|
||||
Some(read) => read.extend(start_offset, end_offset, BlobMeta { key, lsn }),
|
||||
None => VectoredReadExtended::No,
|
||||
};
|
||||
|
||||
if extended == VectoredReadExtended::No {
|
||||
let next_read = VectoredRead::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.max_read_size,
|
||||
);
|
||||
|
||||
let prev_read = current_read.replace(next_read);
|
||||
|
||||
if let Some(read) = prev_read {
|
||||
reads.push(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(read) = current_read {
|
||||
reads.push(read);
|
||||
}
|
||||
|
||||
reads
|
||||
}
|
||||
}
|
||||
|
||||
/// Disk reader for vectored blob spans (does not go through the page cache)
|
||||
pub struct VectoredBlobReader {
|
||||
file: VirtualFile,
|
||||
max_vectored_read_size: usize,
|
||||
}
|
||||
|
||||
impl VectoredBlobReader {
|
||||
pub fn new(file: VirtualFile, max_vectored_read_size: usize) -> Self {
|
||||
Self {
|
||||
file,
|
||||
max_vectored_read_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_max_read_size(&self) -> usize {
|
||||
self.max_vectored_read_size
|
||||
}
|
||||
|
||||
pub fn get_file_ref(&self) -> &VirtualFile {
|
||||
&self.file
|
||||
}
|
||||
|
||||
/// Read the requested blobs into the buffer.
|
||||
///
|
||||
/// We have to deal with the fact that blobs are not fixed size.
|
||||
/// Each blob is prefixed by a size header.
|
||||
///
|
||||
/// The success return value is a struct which contains the buffer
|
||||
/// filled from disk and a list of offsets at which each blob lies
|
||||
/// in the buffer.
|
||||
pub async fn read_blobs(
|
||||
&self,
|
||||
read: &VectoredRead,
|
||||
buf: BytesMut,
|
||||
) -> Result<VectoredBlobsBuf, std::io::Error> {
|
||||
// tracing::info!("read_blobs(read={read:?}, read_size={})", read.size());
|
||||
|
||||
assert!(read.size() > 0);
|
||||
assert!(
|
||||
read.size() <= buf.capacity(),
|
||||
"{} > {}",
|
||||
read.size(),
|
||||
buf.capacity()
|
||||
);
|
||||
let buf = self
|
||||
.file
|
||||
.read_exact_at_n(buf, read.start, read.size())
|
||||
.await?;
|
||||
|
||||
let blobs_at = read.blobs_at.as_slice();
|
||||
let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
|
||||
|
||||
let mut metas = Vec::new();
|
||||
let pairs = blobs_at.iter().zip(
|
||||
blobs_at
|
||||
.iter()
|
||||
.map(Some)
|
||||
.skip(1)
|
||||
.chain(std::iter::once(None)),
|
||||
);
|
||||
for ((offset, meta), next) in pairs {
|
||||
let offset_in_buf = offset - start_offset;
|
||||
let first_len_byte = buf[offset_in_buf as usize];
|
||||
|
||||
// Each blob is prefixed by a header containing it's size.
|
||||
// Extract the size and skip that header to find the start of the data.
|
||||
let (size_length, blob_size) = if first_len_byte < 0x80 {
|
||||
(1, first_len_byte as u64)
|
||||
} else {
|
||||
let mut blob_size_buf = [0u8; 4];
|
||||
let offset_in_buf = offset_in_buf as usize;
|
||||
|
||||
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
|
||||
blob_size_buf[0] &= 0x7f;
|
||||
(4, u32::from_be_bytes(blob_size_buf) as u64)
|
||||
};
|
||||
|
||||
let start = offset_in_buf + size_length;
|
||||
let end = match next {
|
||||
Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
|
||||
None => start + blob_size,
|
||||
};
|
||||
|
||||
assert_eq!(end - start, blob_size);
|
||||
|
||||
metas.push(VectoredBlob {
|
||||
start: start as usize,
|
||||
end: end as usize,
|
||||
meta: *meta,
|
||||
})
|
||||
}
|
||||
|
||||
Ok(VectoredBlobsBuf { buf, blobs: metas })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
|
||||
assert_eq!(read.start, offset_range.first().unwrap().2);
|
||||
|
||||
let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
|
||||
|
||||
let offsets_in_read: Vec<_> = read
|
||||
.blobs_at
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(offset, _)| *offset)
|
||||
.collect();
|
||||
|
||||
assert_eq!(expected_offsets_in_read, offsets_in_read);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn planner_max_read_size_test() {
|
||||
let max_read_size = 128 * 1024;
|
||||
let key = Key::MIN;
|
||||
let lsn = Lsn(0);
|
||||
|
||||
let blob_descriptions = vec![
|
||||
(key, lsn, 0, BlobFlag::None),
|
||||
(key, lsn, 32 * 1024, BlobFlag::None),
|
||||
(key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
|
||||
(key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
|
||||
(key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
|
||||
(key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
|
||||
(key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
|
||||
(key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
|
||||
];
|
||||
|
||||
let ranges = [
|
||||
&blob_descriptions[0..3],
|
||||
&blob_descriptions[3..4],
|
||||
&blob_descriptions[4..5],
|
||||
&blob_descriptions[5..6],
|
||||
&blob_descriptions[6..7],
|
||||
&blob_descriptions[7..],
|
||||
];
|
||||
|
||||
let mut planner = VectoredReadPlanner::new(max_read_size);
|
||||
for (key, lsn, offset, flag) in blob_descriptions.clone() {
|
||||
planner.handle(key, lsn, offset, flag);
|
||||
}
|
||||
|
||||
planner.handle_range_end(652 * 1024);
|
||||
|
||||
let reads = planner.finish();
|
||||
assert_eq!(reads.len(), 6);
|
||||
|
||||
for (idx, read) in reads.iter().enumerate() {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn planner_replacement_test() {
|
||||
let max_read_size = 128 * 1024;
|
||||
let first_key = Key::MIN;
|
||||
let second_key = first_key.next();
|
||||
let lsn = Lsn(0);
|
||||
|
||||
let blob_descriptions = vec![
|
||||
(first_key, lsn, 0, BlobFlag::None), // First in read 1
|
||||
(first_key, lsn, 1024, BlobFlag::None), // Last in read 1
|
||||
(second_key, lsn, 2 * 1024, BlobFlag::Replaces),
|
||||
(second_key, lsn, 3 * 1024, BlobFlag::None),
|
||||
(second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
|
||||
(second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2
|
||||
];
|
||||
|
||||
let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
|
||||
|
||||
let mut planner = VectoredReadPlanner::new(max_read_size);
|
||||
for (key, lsn, offset, flag) in blob_descriptions.clone() {
|
||||
planner.handle(key, lsn, offset, flag);
|
||||
}
|
||||
|
||||
planner.handle_range_end(6 * 1024);
|
||||
|
||||
let reads = planner.finish();
|
||||
assert_eq!(reads.len(), 2);
|
||||
|
||||
for (idx, read) in reads.iter().enumerate() {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -562,18 +562,7 @@ impl VirtualFile {
|
||||
B: IoBufMut + Send,
|
||||
{
|
||||
let (buf, res) =
|
||||
read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
|
||||
res.map(|()| buf)
|
||||
}
|
||||
|
||||
pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
|
||||
where
|
||||
B: IoBufMut + Send,
|
||||
{
|
||||
let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
|
||||
self.read_at(buf, offset)
|
||||
})
|
||||
.await;
|
||||
read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
|
||||
res.map(|()| buf)
|
||||
}
|
||||
|
||||
@@ -707,7 +696,6 @@ impl VirtualFile {
|
||||
pub async fn read_exact_at_impl<B, F, Fut>(
|
||||
buf: B,
|
||||
mut offset: u64,
|
||||
count: Option<usize>,
|
||||
mut read_at: F,
|
||||
) -> (B, std::io::Result<()>)
|
||||
where
|
||||
@@ -715,15 +703,7 @@ where
|
||||
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
||||
{
|
||||
let mut buf: tokio_epoll_uring::Slice<B> = match count {
|
||||
Some(count) => {
|
||||
assert!(count <= buf.bytes_total());
|
||||
assert!(count > 0);
|
||||
buf.slice(..count) // may include uninitialized memory
|
||||
}
|
||||
None => buf.slice_full(), // includes all the uninitialized memory
|
||||
};
|
||||
|
||||
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
|
||||
while buf.bytes_total() != 0 {
|
||||
let res;
|
||||
(buf, res) = read_at(buf, offset).await;
|
||||
@@ -813,7 +793,7 @@ mod test_read_exact_at_impl {
|
||||
result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
|
||||
}]),
|
||||
}));
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -822,33 +802,13 @@ mod test_read_exact_at_impl {
|
||||
assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_with_count() {
|
||||
let buf = Vec::with_capacity(5);
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![Expectation {
|
||||
offset: 0,
|
||||
bytes_total: 3,
|
||||
result: Ok(vec![b'a', b'b', b'c']),
|
||||
}]),
|
||||
}));
|
||||
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
.await;
|
||||
assert!(res.is_ok());
|
||||
assert_eq!(buf, vec![b'a', b'b', b'c']);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_buf_issues_no_syscall() {
|
||||
let buf = Vec::new();
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::new(),
|
||||
}));
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -873,7 +833,7 @@ mod test_read_exact_at_impl {
|
||||
},
|
||||
]),
|
||||
}));
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
@@ -904,7 +864,7 @@ mod test_read_exact_at_impl {
|
||||
},
|
||||
]),
|
||||
}));
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
|
||||
@@ -9,7 +9,7 @@ use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::os::{fd::AsRawFd, unix::process::CommandExt};
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
@@ -26,6 +26,40 @@ mod no_leak_child;
|
||||
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
|
||||
mod protocol;
|
||||
|
||||
///
|
||||
/// Command with ability not to give all file descriptors to child process
|
||||
///
|
||||
trait CloseFileDescriptors: CommandExt {
|
||||
///
|
||||
/// Close file descriptors (other than stdin, stdout, stderr) in child process
|
||||
///
|
||||
fn close_fds(&mut self) -> &mut Command;
|
||||
}
|
||||
|
||||
impl<C: CommandExt> CloseFileDescriptors for C {
|
||||
fn close_fds(&mut self) -> &mut Command {
|
||||
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
|
||||
// which means it should be safe to execute inside a signal handler.
|
||||
// The precise meaning depends on platform. See `man signal-safety`
|
||||
// for the linux definition.
|
||||
//
|
||||
// The set_fds_cloexec_threadsafe function is documented to be
|
||||
// async-signal-safe.
|
||||
//
|
||||
// Aside from this function, the rest of the code is re-entrant and
|
||||
// doesn't make any syscalls. We're just passing constants.
|
||||
//
|
||||
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
|
||||
// which is not async-signal-safe. Be careful.
|
||||
unsafe {
|
||||
self.pre_exec(move || {
|
||||
close_fds::set_fds_cloexec_threadsafe(3, &[]);
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
@@ -79,14 +113,16 @@ impl WalRedoProcess {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
// The redo process is not trusted, and runs in seccomp mode that
|
||||
// doesn't allow it to open any files. We have to also make sure it
|
||||
// doesn't inherit any file descriptors from the pageserver, that
|
||||
// would allow an attacker to read any files that happen to be open
|
||||
// in the pageserver.
|
||||
//
|
||||
// The Rust standard library makes sure to mark any file descriptors with
|
||||
// as close-on-exec by default, but that's not enough, since we use
|
||||
// libraries that directly call libc open without setting that flag.
|
||||
.close_fds()
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
|
||||
256
poetry.lock
generated
256
poetry.lock
generated
@@ -158,28 +158,6 @@ files = [
|
||||
attrs = ">=16.0.0"
|
||||
pluggy = ">=0.4.0"
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "4.3.0"
|
||||
description = "High level compatibility layer for multiple asynchronous event loop implementations"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
|
||||
{file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
|
||||
idna = ">=2.8"
|
||||
sniffio = ">=1.1"
|
||||
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
|
||||
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
|
||||
trio = ["trio (>=0.23)"]
|
||||
|
||||
[[package]]
|
||||
name = "async-timeout"
|
||||
version = "4.0.3"
|
||||
@@ -858,43 +836,43 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "42.0.2"
|
||||
version = "42.0.0"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"},
|
||||
{file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"},
|
||||
{file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"},
|
||||
{file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"},
|
||||
{file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"},
|
||||
{file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"},
|
||||
{file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"},
|
||||
{file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"},
|
||||
{file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"},
|
||||
{file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"},
|
||||
{file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"},
|
||||
{file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
|
||||
{file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
|
||||
{file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
|
||||
{file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
|
||||
{file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
|
||||
{file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
|
||||
{file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
|
||||
{file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
|
||||
{file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
|
||||
{file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
|
||||
{file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
|
||||
{file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1095,100 +1073,6 @@ files = [
|
||||
{file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.14.0"
|
||||
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
|
||||
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "4.1.0"
|
||||
description = "HTTP/2 State-Machine based protocol implementation"
|
||||
optional = false
|
||||
python-versions = ">=3.6.1"
|
||||
files = [
|
||||
{file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
|
||||
{file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
hpack = ">=4.0,<5"
|
||||
hyperframe = ">=6.0,<7"
|
||||
|
||||
[[package]]
|
||||
name = "hpack"
|
||||
version = "4.0.0"
|
||||
description = "Pure-Python HPACK header compression"
|
||||
optional = false
|
||||
python-versions = ">=3.6.1"
|
||||
files = [
|
||||
{file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
|
||||
{file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpcore"
|
||||
version = "1.0.3"
|
||||
description = "A minimal low-level HTTP client."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"},
|
||||
{file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = "*"
|
||||
h11 = ">=0.13,<0.15"
|
||||
|
||||
[package.extras]
|
||||
asyncio = ["anyio (>=4.0,<5.0)"]
|
||||
http2 = ["h2 (>=3,<5)"]
|
||||
socks = ["socksio (==1.*)"]
|
||||
trio = ["trio (>=0.22.0,<0.24.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "httpx"
|
||||
version = "0.26.0"
|
||||
description = "The next generation HTTP client."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
|
||||
{file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
anyio = "*"
|
||||
certifi = "*"
|
||||
h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
|
||||
httpcore = "==1.*"
|
||||
idna = "*"
|
||||
sniffio = "*"
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli", "brotlicffi"]
|
||||
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
|
||||
http2 = ["h2 (>=3,<5)"]
|
||||
socks = ["socksio (==1.*)"]
|
||||
|
||||
[[package]]
|
||||
name = "hyperframe"
|
||||
version = "6.0.1"
|
||||
description = "HTTP/2 framing layer for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.6.1"
|
||||
files = [
|
||||
{file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
|
||||
{file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.3"
|
||||
@@ -2025,20 +1909,6 @@ pytest = [
|
||||
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-repeat"
|
||||
version = "0.9.3"
|
||||
description = "pytest plugin for repeating tests"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"},
|
||||
{file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pytest = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-rerunfailures"
|
||||
version = "13.0"
|
||||
@@ -2182,6 +2052,7 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2271,28 +2142,28 @@ pyasn1 = ">=0.1.3"
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.2.2"
|
||||
version = "0.1.11"
|
||||
description = "An extremely fast Python linter and code formatter, written in Rust."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"},
|
||||
{file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"},
|
||||
{file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"},
|
||||
{file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"},
|
||||
{file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"},
|
||||
{file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"},
|
||||
{file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"},
|
||||
{file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"},
|
||||
{file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"},
|
||||
{file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"},
|
||||
{file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"},
|
||||
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
|
||||
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
|
||||
{file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
|
||||
{file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
|
||||
{file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
|
||||
{file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
|
||||
{file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
|
||||
{file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
|
||||
{file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
|
||||
{file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
|
||||
{file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2354,17 +2225,6 @@ files = [
|
||||
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sniffio"
|
||||
version = "1.3.0"
|
||||
description = "Sniff out which async library your code is running under"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
|
||||
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sshpubkeys"
|
||||
version = "3.3.1"
|
||||
@@ -2571,6 +2431,16 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -2808,4 +2678,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
|
||||
content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"
|
||||
|
||||
@@ -171,8 +171,16 @@ async fn task_main(
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
info!(%peer_addr, "serving");
|
||||
let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
|
||||
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
|
||||
let mut ctx =
|
||||
RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
|
||||
handle_client(
|
||||
&mut ctx,
|
||||
dest_suffix,
|
||||
tls_config,
|
||||
tls_server_end_point,
|
||||
socket,
|
||||
)
|
||||
.await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
@@ -240,7 +248,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
|
||||
async fn handle_client(
|
||||
mut ctx: RequestMonitoring,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dest_suffix: Arc<String>,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
|
||||
@@ -87,22 +87,6 @@ pub mod errors {
|
||||
impl ReportableError for ApiError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::LOCKED,
|
||||
text,
|
||||
} if text.contains("quota exceeded")
|
||||
|| text.contains("the limit for current plan reached") =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::TOO_MANY_REQUESTS,
|
||||
..
|
||||
} => crate::error::ErrorKind::ServiceRateLimit,
|
||||
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
|
||||
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
|
||||
}
|
||||
@@ -238,7 +222,7 @@ pub mod errors {
|
||||
match self {
|
||||
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
|
||||
WakeComputeError::ApiError(e) => e.get_error_kind(),
|
||||
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
|
||||
WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,13 +147,15 @@ impl RequestMonitoring {
|
||||
self.success = true;
|
||||
}
|
||||
|
||||
pub fn log(self) {}
|
||||
}
|
||||
|
||||
impl Drop for RequestMonitoring {
|
||||
fn drop(&mut self) {
|
||||
pub fn log(&mut self) {
|
||||
if let Some(tx) = self.sender.take() {
|
||||
let _: Result<(), _> = tx.send(self.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RequestMonitoring {
|
||||
fn drop(&mut self) {
|
||||
self.log()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,12 +37,9 @@ pub enum ErrorKind {
|
||||
/// Network error between user and proxy. Not necessarily user error
|
||||
ClientDisconnect,
|
||||
|
||||
/// Proxy self-imposed user rate limits
|
||||
/// Proxy self-imposed rate limits
|
||||
RateLimit,
|
||||
|
||||
/// Proxy self-imposed service-wise rate limits
|
||||
ServiceRateLimit,
|
||||
|
||||
/// internal errors
|
||||
Service,
|
||||
|
||||
@@ -57,12 +54,25 @@ pub enum ErrorKind {
|
||||
}
|
||||
|
||||
impl ErrorKind {
|
||||
pub fn to_str(&self) -> &'static str {
|
||||
match self {
|
||||
ErrorKind::User => "request failed due to user error",
|
||||
ErrorKind::ClientDisconnect => "client disconnected",
|
||||
ErrorKind::RateLimit => "request cancelled due to rate limit",
|
||||
ErrorKind::Service => "internal service error",
|
||||
ErrorKind::ControlPlane => "non-retryable control plane error",
|
||||
ErrorKind::Postgres => "postgres error",
|
||||
ErrorKind::Compute => {
|
||||
"non-retryable compute connection error (or exhausted retry capacity)"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_metric_label(&self) -> &'static str {
|
||||
match self {
|
||||
ErrorKind::User => "user",
|
||||
ErrorKind::ClientDisconnect => "clientdisconnect",
|
||||
ErrorKind::RateLimit => "ratelimit",
|
||||
ErrorKind::ServiceRateLimit => "serviceratelimit",
|
||||
ErrorKind::Service => "service",
|
||||
ErrorKind::ControlPlane => "controlplane",
|
||||
ErrorKind::Postgres => "postgres",
|
||||
@@ -75,6 +85,12 @@ pub trait ReportableError: fmt::Display + Send + 'static {
|
||||
fn get_error_kind(&self) -> ErrorKind;
|
||||
}
|
||||
|
||||
impl ReportableError for tokio::time::error::Elapsed {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
ErrorKind::RateLimit
|
||||
}
|
||||
}
|
||||
|
||||
impl ReportableError for tokio_postgres::error::Error {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
if self.as_db_error().is_some() {
|
||||
|
||||
@@ -132,8 +132,9 @@ struct Scram(scram::ServerSecret);
|
||||
|
||||
impl Scram {
|
||||
fn new(password: &str) -> anyhow::Result<Self> {
|
||||
let secret =
|
||||
scram::ServerSecret::build(password).context("failed to generate scram secret")?;
|
||||
let salt = rand::random::<[u8; 16]>();
|
||||
let secret = scram::ServerSecret::build(password, &salt, 256)
|
||||
.context("failed to generate scram secret")?;
|
||||
Ok(Scram(secret))
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,9 @@ mod messages;
|
||||
mod secret;
|
||||
mod signature;
|
||||
|
||||
#[cfg(any(test, doc))]
|
||||
mod password;
|
||||
|
||||
pub use exchange::{exchange, Exchange};
|
||||
pub use key::ScramKey;
|
||||
pub use secret::ServerSecret;
|
||||
@@ -56,21 +59,27 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
|
||||
|
||||
use crate::sasl::{Mechanism, Step};
|
||||
|
||||
use super::{Exchange, ServerSecret};
|
||||
use super::{password::SaltedPassword, Exchange, ServerSecret};
|
||||
|
||||
#[test]
|
||||
fn snapshot() {
|
||||
fn happy_path() {
|
||||
let iterations = 4096;
|
||||
let salt = "QSXCR+Q6sek8bf92";
|
||||
let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
|
||||
let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
|
||||
let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
|
||||
let secret = ServerSecret::parse(&secret).unwrap();
|
||||
let salt_base64 = "QSXCR+Q6sek8bf92";
|
||||
let pw = SaltedPassword::new(
|
||||
b"pencil",
|
||||
base64::decode(salt_base64).unwrap().as_slice(),
|
||||
iterations,
|
||||
);
|
||||
|
||||
let secret = ServerSecret {
|
||||
iterations,
|
||||
salt_base64: salt_base64.to_owned(),
|
||||
stored_key: pw.client_key().sha256(),
|
||||
server_key: pw.server_key(),
|
||||
doomed: false,
|
||||
};
|
||||
const NONCE: [u8; 18] = [
|
||||
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
|
||||
];
|
||||
@@ -112,33 +121,4 @@ mod tests {
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
fn run_round_trip_test(server_password: &str, client_password: &str) {
|
||||
let scram_secret = ServerSecret::build(server_password).unwrap();
|
||||
let sasl_client =
|
||||
ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
|
||||
|
||||
let outcome = super::exchange(
|
||||
&scram_secret,
|
||||
sasl_client,
|
||||
crate::config::TlsServerEndPoint::Undefined,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
match outcome {
|
||||
crate::sasl::Outcome::Success(_) => {}
|
||||
crate::sasl::Outcome::Failure(r) => panic!("{r}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn round_trip() {
|
||||
run_round_trip_test("pencil", "pencil")
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "password doesn't match")]
|
||||
fn failure() {
|
||||
run_round_trip_test("pencil", "eraser")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
/// Faithfully taken from PostgreSQL.
|
||||
pub const SCRAM_KEY_LEN: usize = 32;
|
||||
|
||||
/// One of the keys derived from the user's password.
|
||||
/// One of the keys derived from the [password](super::password::SaltedPassword).
|
||||
/// We use the same structure for all keys, i.e.
|
||||
/// `ClientKey`, `StoredKey`, and `ServerKey`.
|
||||
#[derive(Clone, Default, PartialEq, Eq, Debug)]
|
||||
|
||||
74
proxy/src/scram/password.rs
Normal file
74
proxy/src/scram/password.rs
Normal file
@@ -0,0 +1,74 @@
|
||||
//! Password hashing routines.
|
||||
|
||||
use super::key::ScramKey;
|
||||
|
||||
pub const SALTED_PASSWORD_LEN: usize = 32;
|
||||
|
||||
/// Salted hashed password is essential for [key](super::key) derivation.
|
||||
#[repr(transparent)]
|
||||
pub struct SaltedPassword {
|
||||
bytes: [u8; SALTED_PASSWORD_LEN],
|
||||
}
|
||||
|
||||
impl SaltedPassword {
|
||||
/// See `scram-common.c : scram_SaltedPassword` for details.
|
||||
/// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
|
||||
pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
|
||||
pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
|
||||
}
|
||||
|
||||
/// Derive `ClientKey` from a salted hashed password.
|
||||
pub fn client_key(&self) -> ScramKey {
|
||||
super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
|
||||
}
|
||||
|
||||
/// Derive `ServerKey` from a salted hashed password.
|
||||
pub fn server_key(&self) -> ScramKey {
|
||||
super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
|
||||
#[inline(always)]
|
||||
fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
|
||||
Self { bytes }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SaltedPassword;
|
||||
|
||||
fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
|
||||
let one = 1_u32.to_be_bytes(); // magic
|
||||
|
||||
let mut current = super::super::hmac_sha256(password, [salt, &one]);
|
||||
let mut result = current;
|
||||
for _ in 1..iterations {
|
||||
current = super::super::hmac_sha256(password, [current.as_ref()]);
|
||||
// TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
|
||||
for (i, x) in current.iter().enumerate() {
|
||||
result[i] ^= x;
|
||||
}
|
||||
}
|
||||
|
||||
result.into()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pbkdf2() {
|
||||
let password = "a-very-secure-password";
|
||||
let salt = "such-a-random-salt";
|
||||
let iterations = 4096;
|
||||
let output = [
|
||||
203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
|
||||
101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
|
||||
];
|
||||
|
||||
let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
|
||||
let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
|
||||
|
||||
assert_eq!(actual.bytes, output);
|
||||
assert_eq!(actual.bytes, expected.bytes);
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,7 @@
|
||||
use super::base64_decode_array;
|
||||
use super::key::ScramKey;
|
||||
|
||||
/// Server secret is produced from user's password,
|
||||
/// Server secret is produced from [password](super::password::SaltedPassword)
|
||||
/// and is used throughout the authentication process.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct ServerSecret {
|
||||
@@ -59,10 +59,21 @@ impl ServerSecret {
|
||||
/// Build a new server secret from the prerequisites.
|
||||
/// XXX: We only use this function in tests.
|
||||
#[cfg(test)]
|
||||
pub fn build(password: &str) -> Option<Self> {
|
||||
Self::parse(&postgres_protocol::password::scram_sha_256(
|
||||
password.as_bytes(),
|
||||
))
|
||||
pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
|
||||
// TODO: implement proper password normalization required by the RFC
|
||||
if !password.is_ascii() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
|
||||
|
||||
Some(Self {
|
||||
iterations,
|
||||
salt_base64: base64::encode(salt),
|
||||
stored_key: password.client_key().sha256(),
|
||||
server_key: password.server_key(),
|
||||
doomed: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,4 +103,20 @@ mod tests {
|
||||
assert_eq!(base64::encode(parsed.stored_key), stored_key);
|
||||
assert_eq!(base64::encode(parsed.server_key), server_key);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_scram_secret() {
|
||||
let salt = b"salt";
|
||||
let secret = ServerSecret::build("password", salt, 4096).unwrap();
|
||||
assert_eq!(secret.iterations, 4096);
|
||||
assert_eq!(secret.salt_base64, base64::encode(salt));
|
||||
assert_eq!(
|
||||
base64::encode(secret.stored_key.as_ref()),
|
||||
"lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
|
||||
);
|
||||
assert_eq!(
|
||||
base64::encode(secret.server_key.as_ref()),
|
||||
"ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,10 +88,7 @@ pub async fn task_main(
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
|
||||
// prefer http2, but support http/1.1
|
||||
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
|
||||
let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
|
||||
let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
|
||||
|
||||
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
|
||||
let _ = addr_incoming.set_nodelay(true);
|
||||
|
||||
@@ -12,7 +12,7 @@ use hyper::StatusCode;
|
||||
use hyper::{Body, HeaderMap, Request};
|
||||
use serde_json::json;
|
||||
use serde_json::Value;
|
||||
use tokio::try_join;
|
||||
use tokio::join;
|
||||
use tokio_postgres::error::DbError;
|
||||
use tokio_postgres::error::ErrorPosition;
|
||||
use tokio_postgres::GenericClient;
|
||||
@@ -32,9 +32,11 @@ use crate::auth::ComputeUserInfoParseError;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::config::TlsConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::error::ReportableError;
|
||||
use crate::metrics::HTTP_CONTENT_LENGTH;
|
||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::serverless::backend::HttpConnError;
|
||||
use crate::DbName;
|
||||
use crate::RoleName;
|
||||
|
||||
@@ -164,12 +166,9 @@ fn get_conn_info(
|
||||
let mut options = Option::None;
|
||||
|
||||
for (key, value) in pairs {
|
||||
match &*key {
|
||||
"options" => {
|
||||
options = Some(NeonOptions::parse_options_raw(&value));
|
||||
}
|
||||
"application_name" => ctx.set_application(Some(value.into())),
|
||||
_ => {}
|
||||
if key == "options" {
|
||||
options = Some(NeonOptions::parse_options_raw(&value));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -285,10 +284,8 @@ pub async fn handle(
|
||||
)?
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
// TODO: when http error classification is done, distinguish between
|
||||
// timeout on sql vs timeout in proxy/cplane
|
||||
// ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
|
||||
Err(e) => {
|
||||
ctx.set_error_kind(e.get_error_kind());
|
||||
|
||||
let message = format!(
|
||||
"HTTP-Connection timed out, execution time exeeded {} seconds",
|
||||
@@ -402,11 +399,16 @@ async fn handle_inner(
|
||||
// not strictly necessary to mark success here,
|
||||
// but it's just insurance for if we forget it somewhere else
|
||||
ctx.latency_timer.success();
|
||||
Ok::<_, anyhow::Error>(client)
|
||||
Ok::<_, HttpConnError>(client)
|
||||
};
|
||||
|
||||
// Run both operations in parallel
|
||||
let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
|
||||
let (payload_result, auth_and_connect_result) =
|
||||
join!(fetch_and_process_request, authenticate_and_connect,);
|
||||
|
||||
// Handle the results
|
||||
let payload = payload_result?; // Handle errors appropriately
|
||||
let mut client = auth_and_connect_result?; // Handle errors appropriately
|
||||
|
||||
let mut response = Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
|
||||
@@ -38,22 +38,17 @@ pytest-rerunfailures = "^13.0"
|
||||
types-pytest-lazy-fixture = "^0.6.3.3"
|
||||
pytest-split = "^0.8.1"
|
||||
zstandard = "^0.21.0"
|
||||
httpx = {extras = ["http2"], version = "^0.26.0"}
|
||||
pytest-repeat = "^0.9.3"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
mypy = "==1.3.0"
|
||||
ruff = "^0.2.2"
|
||||
ruff = "^0.1.11"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.mypy]
|
||||
exclude = [
|
||||
"^vendor/",
|
||||
"^target/",
|
||||
]
|
||||
exclude = "^vendor/"
|
||||
check_untyped_defs = true
|
||||
# Help mypy find imports when running against list of individual files.
|
||||
# Without this line it would behave differently when executed on the entire project.
|
||||
@@ -77,13 +72,7 @@ ignore_missing_imports = true
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py39"
|
||||
extend-exclude = [
|
||||
"vendor/",
|
||||
"target/",
|
||||
]
|
||||
line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
|
||||
|
||||
[tool.ruff.lint]
|
||||
extend-exclude = ["vendor/"]
|
||||
ignore = [
|
||||
"E501", # Line too long, we don't want to be too strict about it
|
||||
]
|
||||
@@ -94,3 +83,4 @@ select = [
|
||||
"W", # pycodestyle
|
||||
"B", # bugbear
|
||||
]
|
||||
line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
|
||||
|
||||
@@ -12,12 +12,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
(Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Safekeeper auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
(Scope::PageServerApi, _) => Err(AuthError(
|
||||
"PageServerApi scope makes no sense for Safekeeper".into(),
|
||||
)),
|
||||
(Scope::SafekeeperData, _) => Ok(()),
|
||||
}
|
||||
|
||||
@@ -695,11 +695,9 @@ impl Collector for TimelineCollector {
|
||||
|
||||
// report total number of timelines
|
||||
self.timelines_count.set(timelines_count as i64);
|
||||
mfs.extend(self.timelines_count.collect());
|
||||
|
||||
self.active_timelines_count
|
||||
.set(active_timelines_count as i64);
|
||||
mfs.extend(self.active_timelines_count.collect());
|
||||
mfs.extend(self.timelines_count.collect());
|
||||
|
||||
mfs
|
||||
}
|
||||
|
||||
@@ -188,7 +188,7 @@ const reportSummary = async (params) => {
|
||||
}
|
||||
|
||||
const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
|
||||
let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n`
|
||||
let summary = `\n### Code coverage ([full report](${coverageUrl}))\n`
|
||||
|
||||
const coverage = await (await fetch(summaryJsonUrl)).json()
|
||||
for (const covType of Object.keys(coverage).sort()) {
|
||||
@@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
|
||||
|
||||
summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
|
||||
}
|
||||
summary += "\n\\* collected from Rust tests only\n"
|
||||
|
||||
summary += `\n___\n`
|
||||
|
||||
return summary
|
||||
|
||||
@@ -4,8 +4,6 @@ from typing import Dict, List, Optional, Tuple
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
from prometheus_client.samples import Sample
|
||||
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
class Metrics:
|
||||
metrics: Dict[str, List[Sample]]
|
||||
@@ -33,60 +31,6 @@ class Metrics:
|
||||
return res[0]
|
||||
|
||||
|
||||
class MetricsGetter:
|
||||
"""
|
||||
Mixin for types that implement a `get_metrics` function and would like associated
|
||||
helpers for querying the metrics
|
||||
"""
|
||||
|
||||
def get_metrics(self) -> Metrics:
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_metric_value(
|
||||
self, name: str, filter: Optional[Dict[str, str]] = None
|
||||
) -> Optional[float]:
|
||||
metrics = self.get_metrics()
|
||||
results = metrics.query_all(name, filter=filter)
|
||||
if not results:
|
||||
log.info(f'could not find metric "{name}"')
|
||||
return None
|
||||
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
|
||||
return results[0].value
|
||||
|
||||
def get_metrics_values(
|
||||
self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
When fetching multiple named metrics, it is more efficient to use this
|
||||
than to call `get_metric_value` repeatedly.
|
||||
|
||||
Throws RuntimeError if no metrics matching `names` are found, or if
|
||||
not all of `names` are found: this method is intended for loading sets
|
||||
of metrics whose existence is coupled.
|
||||
|
||||
If it's expected that there may be no results for some of the metrics,
|
||||
specify `absence_ok=True`. The returned dict will then not contain values
|
||||
for these metrics.
|
||||
"""
|
||||
metrics = self.get_metrics()
|
||||
samples = []
|
||||
for name in names:
|
||||
samples.extend(metrics.query_all(name, filter=filter))
|
||||
|
||||
result = {}
|
||||
for sample in samples:
|
||||
if sample.name in result:
|
||||
raise RuntimeError(f"Multiple values found for {sample.name}")
|
||||
result[sample.name] = sample.value
|
||||
|
||||
if not absence_ok:
|
||||
if len(result) != len(names):
|
||||
log.info(f"Metrics found: {metrics.metrics}")
|
||||
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def parse_metrics(text: str, name: str = "") -> Metrics:
|
||||
metrics = Metrics(name)
|
||||
gen = text_string_to_metric_families(text)
|
||||
@@ -103,8 +47,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
|
||||
|
||||
|
||||
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_remote_timeline_client_calls_started_total",
|
||||
"pageserver_remote_timeline_client_calls_finished_total",
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
"pageserver_remote_physical_size",
|
||||
"pageserver_remote_timeline_client_bytes_started_total",
|
||||
"pageserver_remote_timeline_client_bytes_finished_total",
|
||||
@@ -133,6 +76,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
|
||||
*histogram("pageserver_wait_lsn_seconds"),
|
||||
*histogram("pageserver_remote_operation_seconds"),
|
||||
*histogram("pageserver_remote_timeline_client_calls_started"),
|
||||
*histogram("pageserver_io_operations_seconds"),
|
||||
"pageserver_tenant_states_count",
|
||||
)
|
||||
|
||||
@@ -27,7 +27,6 @@ from urllib.parse import quote, urlparse
|
||||
|
||||
import asyncpg
|
||||
import backoff
|
||||
import httpx
|
||||
import jwt
|
||||
import psycopg2
|
||||
import pytest
|
||||
@@ -47,7 +46,6 @@ from urllib3.util.retry import Retry
|
||||
from fixtures import overlayfs
|
||||
from fixtures.broker import NeonBroker
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.pageserver.allowed_errors import (
|
||||
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
|
||||
scan_pageserver_log_for_errors,
|
||||
@@ -488,11 +486,6 @@ class NeonEnvBuilder:
|
||||
|
||||
self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
|
||||
|
||||
self.pageserver_get_vectored_impl: Optional[str] = None
|
||||
if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
|
||||
self.pageserver_get_vectored_impl = "vectored"
|
||||
log.debug('Overriding pageserver get_vectored_impl config to "vectored"')
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
), "Unexpectedly instantiated from outside a test function"
|
||||
@@ -1060,8 +1053,6 @@ class NeonEnv:
|
||||
}
|
||||
if self.pageserver_virtual_file_io_engine is not None:
|
||||
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
|
||||
if config.pageserver_get_vectored_impl is not None:
|
||||
ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
|
||||
|
||||
# Create a corresponding NeonPageserver object
|
||||
self.pageservers.append(
|
||||
@@ -1922,7 +1913,7 @@ class Pagectl(AbstractNeonCli):
|
||||
return IndexPartDump.from_json(parsed)
|
||||
|
||||
|
||||
class NeonAttachmentService(MetricsGetter):
|
||||
class NeonAttachmentService:
|
||||
def __init__(self, env: NeonEnv, auth_enabled: bool):
|
||||
self.env = env
|
||||
self.running = False
|
||||
@@ -1960,11 +1951,6 @@ class NeonAttachmentService(MetricsGetter):
|
||||
|
||||
return headers
|
||||
|
||||
def get_metrics(self) -> Metrics:
|
||||
res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
|
||||
res.raise_for_status()
|
||||
return parse_metrics(res.text)
|
||||
|
||||
def ready(self) -> bool:
|
||||
resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
|
||||
if resp.status_code == 503:
|
||||
@@ -2108,17 +2094,6 @@ class NeonAttachmentService(MetricsGetter):
|
||||
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
|
||||
assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
|
||||
|
||||
def consistency_check(self):
|
||||
"""
|
||||
Throw an exception if the service finds any inconsistencies in its state
|
||||
"""
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/debug/v1/consistency_check",
|
||||
)
|
||||
response.raise_for_status()
|
||||
log.info("Attachment service passed consistency check")
|
||||
|
||||
def __enter__(self) -> "NeonAttachmentService":
|
||||
return self
|
||||
|
||||
@@ -2864,34 +2839,9 @@ class NeonProxy(PgProtocol):
|
||||
)
|
||||
|
||||
if expected_code is not None:
|
||||
assert response.status_code == expected_code, f"response: {response.json()}"
|
||||
assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
|
||||
return response.json()
|
||||
|
||||
async def http2_query(self, query, args, **kwargs):
|
||||
# TODO maybe use default values if not provided
|
||||
user = kwargs["user"]
|
||||
password = kwargs["password"]
|
||||
expected_code = kwargs.get("expected_code")
|
||||
|
||||
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
|
||||
async with httpx.AsyncClient(
|
||||
http2=True, verify=str(self.test_output_dir / "proxy.crt")
|
||||
) as client:
|
||||
response = await client.post(
|
||||
f"https://{self.domain}:{self.external_http_port}/sql",
|
||||
json={"query": query, "params": args},
|
||||
headers={
|
||||
"Content-Type": "application/sql",
|
||||
"Neon-Connection-String": connstr,
|
||||
"Neon-Pool-Opt-In": "true",
|
||||
},
|
||||
)
|
||||
assert response.http_version == "HTTP/2"
|
||||
|
||||
if expected_code is not None:
|
||||
assert response.status_code == expected_code, f"response: {response.json()}"
|
||||
return response.json()
|
||||
|
||||
def get_metrics(self) -> str:
|
||||
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
|
||||
request_result.raise_for_status()
|
||||
|
||||
@@ -82,11 +82,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
# During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
|
||||
# up is tracked in https://github.com/neondatabase/neon/issues/6096
|
||||
".*Cancelled, shutting down.*",
|
||||
# Open layers are only rolled at Lsn boundaries to avoid name clashses.
|
||||
# Hence, we can overshoot the soft limit set by checkpoint distance.
|
||||
# This is especially pronounced in tests that set small checkpoint
|
||||
# distances.
|
||||
".*Flushed oversized open layer with size.*",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.metrics import Metrics, parse_metrics
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import Fn
|
||||
@@ -125,7 +125,7 @@ class TenantConfig:
|
||||
)
|
||||
|
||||
|
||||
class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
class PageserverHttpClient(requests.Session):
|
||||
def __init__(
|
||||
self,
|
||||
port: int,
|
||||
@@ -694,33 +694,71 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
},
|
||||
).value
|
||||
|
||||
def get_remote_timeline_client_queue_count(
|
||||
def get_remote_timeline_client_metric(
|
||||
self,
|
||||
metric_name: str,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: str,
|
||||
op_kind: str,
|
||||
) -> Optional[int]:
|
||||
metrics = [
|
||||
"pageserver_remote_timeline_client_calls_started_total",
|
||||
"pageserver_remote_timeline_client_calls_finished_total",
|
||||
]
|
||||
res = self.get_metrics_values(
|
||||
metrics,
|
||||
) -> Optional[float]:
|
||||
metrics = self.get_metrics()
|
||||
matches = metrics.query_all(
|
||||
name=metric_name,
|
||||
filter={
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"file_kind": str(file_kind),
|
||||
"op_kind": str(op_kind),
|
||||
},
|
||||
absence_ok=True,
|
||||
)
|
||||
if len(res) != 2:
|
||||
if len(matches) == 0:
|
||||
value = None
|
||||
elif len(matches) == 1:
|
||||
value = matches[0].value
|
||||
assert value is not None
|
||||
else:
|
||||
assert len(matches) < 2, "above filter should uniquely identify metric"
|
||||
return value
|
||||
|
||||
def get_metric_value(
|
||||
self, name: str, filter: Optional[Dict[str, str]] = None
|
||||
) -> Optional[float]:
|
||||
metrics = self.get_metrics()
|
||||
results = metrics.query_all(name, filter=filter)
|
||||
if not results:
|
||||
log.info(f'could not find metric "{name}"')
|
||||
return None
|
||||
inc, dec = [res[metric] for metric in metrics]
|
||||
queue_count = int(inc) - int(dec)
|
||||
assert queue_count >= 0
|
||||
return queue_count
|
||||
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
|
||||
return results[0].value
|
||||
|
||||
def get_metrics_values(
|
||||
self, names: list[str], filter: Optional[Dict[str, str]] = None
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
When fetching multiple named metrics, it is more efficient to use this
|
||||
than to call `get_metric_value` repeatedly.
|
||||
|
||||
Throws RuntimeError if no metrics matching `names` are found, or if
|
||||
not all of `names` are found: this method is intended for loading sets
|
||||
of metrics whose existence is coupled.
|
||||
"""
|
||||
metrics = self.get_metrics()
|
||||
samples = []
|
||||
for name in names:
|
||||
samples.extend(metrics.query_all(name, filter=filter))
|
||||
|
||||
result = {}
|
||||
for sample in samples:
|
||||
if sample.name in result:
|
||||
raise RuntimeError(f"Multiple values found for {sample.name}")
|
||||
result[sample.name] = sample.value
|
||||
|
||||
if len(result) != len(names):
|
||||
log.info(f"Metrics found: {metrics.metrics}")
|
||||
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
|
||||
|
||||
return result
|
||||
|
||||
def layer_map_info(
|
||||
self,
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from mypy_boto3_s3.type_defs import (
|
||||
DeleteObjectOutputTypeDef,
|
||||
EmptyResponseMetadataTypeDef,
|
||||
ListObjectsV2OutputTypeDef,
|
||||
ObjectTypeDef,
|
||||
@@ -221,40 +220,16 @@ def wait_for_upload_queue_empty(
|
||||
):
|
||||
while True:
|
||||
all_metrics = pageserver_http.get_metrics()
|
||||
started = all_metrics.query_all(
|
||||
"pageserver_remote_timeline_client_calls_started_total",
|
||||
tl = all_metrics.query_all(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
{
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
)
|
||||
finished = all_metrics.query_all(
|
||||
"pageserver_remote_timeline_client_calls_finished_total",
|
||||
{
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
)
|
||||
assert len(started) == len(finished)
|
||||
# this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
|
||||
remaining_labels = ["shard_id", "file_kind", "op_kind"]
|
||||
tl: List[Tuple[Any, float]] = []
|
||||
for s in started:
|
||||
found = False
|
||||
for f in finished:
|
||||
if all([s.labels[label] == f.labels[label] for label in remaining_labels]):
|
||||
assert (
|
||||
not found
|
||||
), "duplicate match, remaining_labels don't uniquely identify sample"
|
||||
tl.append((s.labels, int(s.value) - int(f.value)))
|
||||
found = True
|
||||
if not found:
|
||||
tl.append((s.labels, int(s.value)))
|
||||
assert len(tl) == len(started), "something broken with join logic"
|
||||
log.info(f"upload queue for {tenant_id}/{timeline_id}:")
|
||||
for labels, queue_count in tl:
|
||||
log.info(f" {labels}: {queue_count}")
|
||||
if all(queue_count == 0 for (_, queue_count) in tl):
|
||||
assert len(tl) > 0
|
||||
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
|
||||
if all(m.value == 0 for m in tl):
|
||||
return
|
||||
time.sleep(0.2)
|
||||
|
||||
@@ -356,6 +331,7 @@ def list_prefix(
|
||||
"""
|
||||
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
|
||||
assert isinstance(remote, S3Storage), "localfs is currently not supported"
|
||||
assert remote.client is not None
|
||||
|
||||
prefix_in_bucket = remote.prefix_in_bucket or ""
|
||||
if not prefix:
|
||||
@@ -374,29 +350,6 @@ def list_prefix(
|
||||
return response
|
||||
|
||||
|
||||
def remote_storage_delete_key(
|
||||
remote: RemoteStorage,
|
||||
key: str,
|
||||
) -> DeleteObjectOutputTypeDef:
|
||||
"""
|
||||
Note that this function takes into account prefix_in_bucket.
|
||||
"""
|
||||
# For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now.
|
||||
assert isinstance(remote, S3Storage), "localfs is currently not supported"
|
||||
|
||||
prefix_in_bucket = remote.prefix_in_bucket or ""
|
||||
|
||||
# real s3 tests have uniqie per test prefix
|
||||
# mock_s3 tests use special pageserver prefix for pageserver stuff
|
||||
key = "/".join((prefix_in_bucket, key))
|
||||
|
||||
response = remote.client.delete_object(
|
||||
Bucket=remote.bucket_name,
|
||||
Key=key,
|
||||
)
|
||||
return response
|
||||
|
||||
|
||||
def enable_remote_storage_versioning(
|
||||
remote: RemoteStorage,
|
||||
) -> EmptyResponseMetadataTypeDef:
|
||||
@@ -405,6 +358,7 @@ def enable_remote_storage_versioning(
|
||||
"""
|
||||
# local_fs has no support for versioning
|
||||
assert isinstance(remote, S3Storage), "localfs is currently not supported"
|
||||
assert remote.client is not None
|
||||
|
||||
# The SDK supports enabling versioning on normal S3 as well but we don't want to change
|
||||
# these settings from a test in a live bucket (also, our access isn't enough nor should it be)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user