Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-01 09:40:38 +00:00)

Compare commits: readme_add...jcsp/foo-b (60 commits)
| SHA1 |
|---|
| f12e438c09 |
| 7bae78186b |
| 7e560dd00e |
| 684e924211 |
| 8ace9ea25f |
| 6a4f49b08b |
| c6e89445e2 |
| 04f32b9526 |
| 6f2333f52b |
| d447f49bc3 |
| c5972389aa |
| c4f5736d5a |
| 518f598e2d |
| 4b711caf5e |
| 2cf47b1477 |
| 7dcfcccf7c |
| a26cc29d92 |
| 5f2f31e879 |
| 938b163b42 |
| 5cbf5b45ae |
| af5c54ed14 |
| 523cf71721 |
| c47f355ec1 |
| 4f67b0225b |
| 2f7cecaf6a |
| 589594c2e1 |
| 70fe007519 |
| b224a5a377 |
| a65d437930 |
| fc67f8dc60 |
| 2b65a2b53e |
| 9490360df4 |
| 91d947654e |
| 37aa6fd953 |
| 3ad567290c |
| 3a110e45ed |
| e7e6319e20 |
| d865881d59 |
| 1c5d6e59a0 |
| 263dfba6ee |
| df3996265f |
| 29699529df |
| f446e08fb8 |
| 4d5add9ca0 |
| 59b4c2eaf9 |
| 5432155b0d |
| e16e82749f |
| 9f653893b9 |
| 913af44219 |
| ecd615ab6d |
| c9b2ec9ff1 |
| a3800dcb0c |
| 9a32aa828d |
| f03f7b3868 |
| ec5dce04eb |
| 6014f15157 |
| e675a21346 |
| 6b93230270 |
| 797aa4ffaa |
| c45b56e0bb |
@@ -13,6 +13,7 @@
# Directories
!.cargo/
!.config/
!compute/
!compute_tools/
!control_plane/
!libs/
10  .github/workflows/_build-and-test-locally.yml (vendored)

@@ -257,7 +257,15 @@ jobs:
          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'

      - name: Install postgres binaries
        run: cp -a pg_install /tmp/neon/pg_install
        run: |
          # Use tar to copy files matching the pattern, preserving the paths in the destination
          tar c \
            pg_install/v* \
            pg_install/build/*/src/test/regress/*.so \
            pg_install/build/*/src/test/regress/pg_regress \
            pg_install/build/*/src/test/isolation/isolationtester \
            pg_install/build/*/src/test/isolation/pg_isolation_regress \
            | tar x -C /tmp/neon

      - name: Upload Neon artifact
        uses: ./.github/actions/upload
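The `tar c … | tar x -C /tmp/neon` pipe above copies only the globbed files while recreating their relative paths under the destination, which a plain `cp -a` of one directory cannot do selectively. A minimal standalone sketch of the same pattern, with illustrative paths rather than the workflow's:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Illustrative source globs and destination; the workflow uses pg_install/... and /tmp/neon.
dest=/tmp/artifact
mkdir -p "$dest"

# "tar c" streams the matched files with their relative paths preserved,
# and "tar x -C" recreates exactly that layout under $dest.
tar c \
  build/v*/bin \
  build/v*/lib/*.so \
  | tar x -C "$dest"
```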
121  .github/workflows/build_and_test.yml (vendored)

@@ -120,6 +120,59 @@ jobs:
      - name: Run mypy to check types
        run: poetry run mypy .

  # Check that the vendor/postgres-* submodules point to the
  # corresponding REL_*_STABLE_neon branches.
  check-submodules:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - uses: dorny/paths-filter@v3
        id: check-if-submodules-changed
        with:
          filters: |
            vendor:
              - 'vendor/**'

      - name: Check vendor/postgres-v14 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v14"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true

      - name: Check vendor/postgres-v15 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v15"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true

      - name: Check vendor/postgres-v16 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v16"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true

      - name: Check vendor/postgres-v17 submodule reference
        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
        uses: jtmullen/submodule-branch-check-action@v1
        with:
          path: "vendor/postgres-v17"
          fetch_depth: "50"
          sub_fetch_depth: "50"
          pass_if_unchanged: true
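The check-submodules job only asserts that each vendor/postgres-* submodule commit lies on its corresponding REL_*_STABLE_neon branch, per the comment above. A hedged local sketch of the same assertion with plain git; the path and branch below follow that naming convention and should be adjusted per version:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Verify that the commit a submodule points at is reachable from the expected branch.
path=vendor/postgres-v16
branch=REL_16_STABLE_neon

git -C "$path" fetch --depth 50 origin "$branch"
if git -C "$path" merge-base --is-ancestor HEAD "origin/$branch"; then
  echo "OK: $path points into origin/$branch"
else
  echo "ERROR: $path does not point into origin/$branch" >&2
  exit 1
fi
```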
  check-codestyle-rust:
    needs: [ check-permissions, build-build-tools-image ]
    strategy:

@@ -159,6 +212,10 @@ jobs:
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
      #
      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything a second
      # time just for that, so skip "clippy --release".
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then

@@ -168,8 +225,6 @@ jobs:
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
      - name: Run cargo clippy (release)
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

      - name: Check documentation generation
        run: cargo doc --workspace --no-deps --document-private-items
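`cargo hack --feature-powerset` runs the given cargo command once per combination of the crate's features, so clippy warnings hidden behind optional features are caught too. A hedged sketch of running the same check locally; the arguments after `clippy` are illustrative stand-ins for `$CLIPPY_COMMON_ARGS`:

```bash
# Install cargo-hack once, then lint every feature combination of the workspace.
cargo install cargo-hack

# Mirrors the CI step above; "-D warnings" stands in for $CLIPPY_COMMON_ARGS.
cargo hack --feature-powerset clippy --locked -- -D warnings
```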
@@ -547,7 +602,20 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16, v17 ]
        version:
          # Much data was already generated on old PG versions with bullseye's
          # libraries, the locales of which can cause data incompatibilities.
          # However, new PG versions should check if they can be built on newer
          # images, as that reduces the support burden of old and ancient
          # distros.
          - pg: v14
            debian: bullseye-slim
          - pg: v15
            debian: bullseye-slim
          - pg: v16
            debian: bullseye-slim
          - pg: v17
            debian: bookworm-slim
        arch: [ x64, arm64 ]

    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -590,41 +658,46 @@ jobs:
          context: .
          build-args: |
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
            DEBIAN_FLAVOR=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: Dockerfile.compute-node
          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          file: compute/Dockerfile.compute-node
          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
            neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

      - name: Build neon extensions test image
        if: matrix.version == 'v16'
        if: matrix.version.pg == 'v16'
        uses: docker/build-push-action@v6
        with:
          context: .
          build-args: |
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
            DEBIAN_FLAVOR=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: Dockerfile.compute-node
          file: compute/Dockerfile.compute-node
          target: neon-pg-ext-test
          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
        if: matrix.version == 'v17'
        # We pick 16, because that builds on debian 11 with older glibc (and is
        # thus compatible with newer glibc), rather than 17 on Debian 12, as
        # that isn't guaranteed to be compatible with Debian 11
        if: matrix.version.pg == 'v16'
        uses: docker/build-push-action@v6
        with:
          target: compute-tools-image
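The cache-from/cache-to pair above uses a registry-backed BuildKit cache: every (PG version, arch) build pulls cache layers from cache.neon.build and, on main, writes them back. A hedged local sketch of the same registry-cache pattern with `docker buildx build`; the image and cache references are illustrative:

```bash
# Registry-cache pattern used by the workflow (refs and tags are illustrative).
docker buildx build . \
  -f compute/Dockerfile.compute-node \
  --build-arg PG_VERSION=v16 \
  --build-arg DEBIAN_FLAVOR=bullseye-slim \
  --cache-from type=registry,ref=example.registry/compute-node-v16:cache-x64 \
  --cache-to   type=registry,ref=example.registry/compute-node-v16:cache-x64,mode=max \
  -t example.registry/compute-node-v16:dev-x64 \
  --push
```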
@@ -633,10 +706,11 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
            DEBIAN_FLAVOR=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: Dockerfile.compute-node
          file: compute/Dockerfile.compute-node
          tags: |
            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -724,7 +798,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
            -spec=vm-image-spec.yaml \
            -spec=compute/vm-image-spec.yaml \
            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
@@ -788,6 +862,9 @@ jobs:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

    permissions:
      id-token: write # for `aws-actions/configure-aws-credentials`

    env:
      VERSIONS: v14 v15 v16 v17

@@ -832,13 +909,19 @@ jobs:
          docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
            neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

      - name: Configure AWS-prod credentials
        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: eu-central-1
          mask-aws-account-id: true
          role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}

      - name: Login to prod ECR
        uses: docker/login-action@v3
        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
        with:
          registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
          password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}

      - name: Copy all images to prod ECR
        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
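Per-architecture images are pushed with `-x64` / `-arm64` suffixes and then stitched into a single multi-arch tag with `docker buildx imagetools create`, as the `:latest` example above does. A hedged sketch of the same step for a per-build tag; repository and tag names are illustrative:

```bash
# Combine the per-architecture pushes into one multi-arch tag
# (repository and tag names are illustrative).
docker buildx imagetools create \
  -t example.registry/compute-node-v16:build-123 \
  example.registry/compute-node-v16:build-123-x64 \
  example.registry/compute-node-v16:build-123-arm64

# Inspect the resulting manifest list.
docker buildx imagetools inspect example.registry/compute-node-v16:build-123
```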
102  .github/workflows/cloud-regress.yml (vendored, new file)

@@ -0,0 +1,102 @@
name: Cloud Regression Test
on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '45 1 * * *' # run once a day, timezone is utc
  workflow_dispatch: # adds ability to run this manually

defaults:
  run:
    shell: bash -euxo pipefail {0}

concurrency:
  # Allow only one workflow
  group: ${{ github.workflow }}
  cancel-in-progress: true

jobs:
  regress:
    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

    runs-on: us-east-2
    container:
      image: neondatabase/build-tools:pinned
      options: --init

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true

      - name: Patch the test
        run: |
          cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
          patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"

      - name: Generate a random password
        id: pwgen
        run: |
          set +x
          DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64)
          echo "::add-mask::${DBPASS//\//}"
          echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}"

      - name: Change tests according to the generated password
        env:
          DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
        run: |
          cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
          for fname in sql/*.sql expected/*.out; do
            sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
          done
          for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do
            USER=$(echo "${ph}" | cut -c 22-)
            MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}')
            sed -i.bak "s/${ph}/${MD5}/" expected/password.out
          done
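The second loop rewrites the `NEON_MD5_PLACEHOLDER...` markers into real PostgreSQL md5 password hashes, which are defined as the literal `md5` prefix followed by `md5(password || username)`. A minimal sketch of that derivation on its own; the password and role name are illustrative:

```bash
# PostgreSQL md5 auth strings are "md5" + md5(password concatenated with role name).
# The values below are illustrative.
DBPASS='s3cr3t-example'
ROLE='regress_user'

HASH="md5$(printf '%s%s' "$DBPASS" "$ROLE" | md5sum | awk '{print $1}')"
echo "$HASH"   # e.g. md5a1b2c3... — the form expected/password.out must contain
```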
      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest

      - name: Run the regression tests
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: cloud_regress
          pg_version: ${{ env.DEFAULT_PG_VERSION }}
          extra_params: -m remote_cluster
        env:
          BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}

      - name: Create Allure report
        id: create-allure-report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # on-call-staging-stream
          slack-message: |
            Periodic pg_regress on staging: ${{ job.status }}
            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
4  .github/workflows/trigger-e2e-tests.yml (vendored)

@@ -102,12 +102,12 @@ jobs:
          # Default set of platforms to run e2e tests on
          platforms='["docker", "k8s"]'

          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
                vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
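The jq expression appends "k8s-neonvm" and de-duplicates, so hitting several matching files still adds the platform only once. A standalone sketch of that step:

```bash
# The append-and-dedupe step in isolation.
platforms='["docker", "k8s"]'
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
echo "${platforms}"   # ["docker","k8s","k8s-neonvm"]

# Running it again is a no-op thanks to `unique`.
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
echo "${platforms}"   # still ["docker","k8s","k8s-neonvm"]
```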
317  Cargo.lock (generated)
@@ -255,12 +255,6 @@ dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
|
||||
|
||||
[[package]]
|
||||
name = "atomic-take"
|
||||
version = "1.1.0"
|
||||
@@ -295,8 +289,8 @@ dependencies = [
|
||||
"fastrand 2.0.0",
|
||||
"hex",
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"ring 0.17.6",
|
||||
"hyper 0.14.30",
|
||||
"ring",
|
||||
"time",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -486,7 +480,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"p256 0.11.1",
|
||||
"percent-encoding",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"sha2",
|
||||
"subtle",
|
||||
"time",
|
||||
@@ -593,7 +587,7 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body 1.0.0",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"hyper-rustls 0.24.0",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
@@ -684,7 +678,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"itoa",
|
||||
"matchit 0.7.0",
|
||||
"memchr",
|
||||
@@ -1089,9 +1083,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ciborium"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926"
|
||||
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
|
||||
dependencies = [
|
||||
"ciborium-io",
|
||||
"ciborium-ll",
|
||||
@@ -1100,18 +1094,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ciborium-io"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656"
|
||||
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
|
||||
|
||||
[[package]]
|
||||
name = "ciborium-ll"
|
||||
version = "0.2.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
|
||||
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
|
||||
dependencies = [
|
||||
"ciborium-io",
|
||||
"half 1.8.2",
|
||||
"half",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1224,7 +1218,7 @@ dependencies = [
|
||||
"compute_api",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"nix 0.27.1",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
@@ -1327,10 +1321,9 @@ dependencies = [
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -2304,12 +2297,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "2.4.1"
|
||||
@@ -2411,17 +2398,6 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hostname"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"match_cfg",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hostname"
|
||||
version = "0.4.0"
|
||||
@@ -2430,7 +2406,7 @@ checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"windows 0.52.0",
|
||||
"windows",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2539,9 +2515,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "0.14.26"
|
||||
version = "0.14.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4"
|
||||
checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
@@ -2554,7 +2530,7 @@ dependencies = [
|
||||
"httpdate",
|
||||
"itoa",
|
||||
"pin-project-lite",
|
||||
"socket2 0.4.9",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -2589,7 +2565,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
|
||||
dependencies = [
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"log",
|
||||
"rustls 0.21.11",
|
||||
"rustls-native-certs 0.6.2",
|
||||
@@ -2620,7 +2596,7 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
|
||||
dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
@@ -2639,7 +2615,7 @@ dependencies = [
|
||||
"http-body 1.0.0",
|
||||
"hyper 1.2.0",
|
||||
"pin-project-lite",
|
||||
"socket2 0.5.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tower",
|
||||
"tower-service",
|
||||
@@ -2648,16 +2624,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.56"
|
||||
version = "0.1.61"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c"
|
||||
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows 0.48.0",
|
||||
"windows-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2870,7 +2846,7 @@ dependencies = [
|
||||
"base64 0.21.1",
|
||||
"js-sys",
|
||||
"pem",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"simple_asn1",
|
||||
@@ -2908,11 +2884,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
dependencies = [
|
||||
"spin 0.5.2",
|
||||
"spin",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2974,12 +2950,6 @@ dependencies = [
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "match_cfg"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4"
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.1.0"
|
||||
@@ -3072,15 +3042,6 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.9.0"
|
||||
@@ -3616,7 +3577,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"camino",
|
||||
"clap",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
@@ -3655,12 +3615,11 @@ dependencies = [
|
||||
"enumset",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"indoc",
|
||||
"itertools 0.10.5",
|
||||
"md5",
|
||||
@@ -3775,7 +3734,6 @@ dependencies = [
|
||||
"clap",
|
||||
"criterion",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex-literal",
|
||||
"itertools 0.10.5",
|
||||
"once_cell",
|
||||
@@ -3853,7 +3811,7 @@ dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"half 2.4.1",
|
||||
"half",
|
||||
"hashbrown 0.14.5",
|
||||
"num",
|
||||
"num-bigint",
|
||||
@@ -4140,7 +4098,7 @@ dependencies = [
|
||||
"crc32c",
|
||||
"env_logger",
|
||||
"log",
|
||||
"memoffset 0.8.0",
|
||||
"memoffset 0.9.0",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"regex",
|
||||
@@ -4345,17 +4303,16 @@ dependencies = [
|
||||
"fallible-iterator",
|
||||
"framed-websockets",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.14.5",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hostname 0.3.1",
|
||||
"hostname",
|
||||
"http 1.1.0",
|
||||
"http-body-util",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
@@ -4400,7 +4357,7 @@ dependencies = [
|
||||
"signature 2.2.0",
|
||||
"smallvec",
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
"socket2",
|
||||
"subtle",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
@@ -4578,7 +4535,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
|
||||
dependencies = [
|
||||
"pem",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"time",
|
||||
"yasna",
|
||||
]
|
||||
@@ -4602,7 +4559,7 @@ dependencies = [
|
||||
"rustls-pki-types",
|
||||
"ryu",
|
||||
"sha1_smol",
|
||||
"socket2 0.5.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
@@ -4714,7 +4671,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"http-types",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -4747,7 +4704,7 @@ dependencies = [
|
||||
"h2 0.3.26",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"hyper-rustls 0.24.0",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
@@ -4905,21 +4862,6 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"spin 0.5.2",
|
||||
"untrusted 0.7.1",
|
||||
"web-sys",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.6"
|
||||
@@ -4929,8 +4871,8 @@ dependencies = [
|
||||
"cc",
|
||||
"getrandom 0.2.11",
|
||||
"libc",
|
||||
"spin 0.9.8",
|
||||
"untrusted 0.9.0",
|
||||
"spin",
|
||||
"untrusted",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@@ -4950,7 +4892,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
|
||||
dependencies = [
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"lazy_static",
|
||||
"percent-encoding",
|
||||
"regex",
|
||||
@@ -5074,7 +5016,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
|
||||
dependencies = [
|
||||
"log",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"rustls-webpki 0.101.7",
|
||||
"sct",
|
||||
]
|
||||
@@ -5086,7 +5028,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
|
||||
dependencies = [
|
||||
"log",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki 0.102.2",
|
||||
"subtle",
|
||||
@@ -5143,24 +5085,14 @@ version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.100.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
|
||||
dependencies = [
|
||||
"ring 0.16.20",
|
||||
"untrusted 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.101.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765"
|
||||
dependencies = [
|
||||
"ring 0.17.6",
|
||||
"untrusted 0.9.0",
|
||||
"ring",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5169,9 +5101,9 @@ version = "0.102.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
|
||||
dependencies = [
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"untrusted 0.9.0",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5202,10 +5134,9 @@ dependencies = [
|
||||
"desim",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -5262,11 +5193,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.21"
|
||||
version = "0.1.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3"
|
||||
checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
|
||||
dependencies = [
|
||||
"windows-sys 0.42.0",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5290,8 +5221,8 @@ version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
|
||||
dependencies = [
|
||||
"ring 0.17.6",
|
||||
"untrusted 0.9.0",
|
||||
"ring",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5400,7 +5331,7 @@ version = "0.32.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
|
||||
dependencies = [
|
||||
"hostname 0.4.0",
|
||||
"hostname",
|
||||
"libc",
|
||||
"os_info",
|
||||
"rustc_version",
|
||||
@@ -5712,16 +5643,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.5.5"
|
||||
@@ -5732,12 +5653,6 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spin"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||
|
||||
[[package]]
|
||||
name = "spin"
|
||||
version = "0.9.8"
|
||||
@@ -5781,9 +5696,8 @@ dependencies = [
|
||||
"futures",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -5809,10 +5723,9 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"lasso",
|
||||
"measured",
|
||||
@@ -5862,7 +5775,6 @@ dependencies = [
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"itertools 0.10.5",
|
||||
@@ -6228,7 +6140,7 @@ dependencies = [
|
||||
"num_cpus",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
"socket2 0.5.5",
|
||||
"socket2",
|
||||
"tokio-macros",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
@@ -6288,7 +6200,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"socket2 0.5.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
@@ -6300,7 +6212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"rustls 0.22.4",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -6434,7 +6346,7 @@ dependencies = [
|
||||
"h2 0.3.26",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"hyper-timeout",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
@@ -6611,7 +6523,7 @@ dependencies = [
|
||||
name = "tracing-utils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry-semantic-conventions",
|
||||
@@ -6714,12 +6626,6 @@ version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
@@ -6728,17 +6634,18 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "ureq"
|
||||
version = "2.7.1"
|
||||
version = "2.9.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9"
|
||||
checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.22.1",
|
||||
"log",
|
||||
"once_cell",
|
||||
"rustls 0.21.11",
|
||||
"rustls-webpki 0.100.2",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki 0.102.2",
|
||||
"url",
|
||||
"webpki-roots 0.23.1",
|
||||
"webpki-roots 0.26.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6799,10 +6706,11 @@ dependencies = [
|
||||
"criterion",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -6837,11 +6745,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "1.6.1"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
|
||||
checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
|
||||
dependencies = [
|
||||
"atomic",
|
||||
"getrandom 0.2.11",
|
||||
"serde",
|
||||
]
|
||||
@@ -7075,15 +6982,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.23.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
|
||||
dependencies = [
|
||||
"rustls-webpki 0.100.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.25.2"
|
||||
@@ -7152,15 +7050,6 @@ version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
|
||||
dependencies = [
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.52.0"
|
||||
@@ -7180,21 +7069,6 @@ dependencies = [
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.42.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm 0.42.2",
|
||||
"windows_aarch64_msvc 0.42.2",
|
||||
"windows_i686_gnu 0.42.2",
|
||||
"windows_i686_msvc 0.42.2",
|
||||
"windows_x86_64_gnu 0.42.2",
|
||||
"windows_x86_64_gnullvm 0.42.2",
|
||||
"windows_x86_64_msvc 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
@@ -7243,12 +7117,6 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.48.0"
|
||||
@@ -7261,12 +7129,6 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.48.0"
|
||||
@@ -7279,12 +7141,6 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.48.0"
|
||||
@@ -7297,12 +7153,6 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.48.0"
|
||||
@@ -7315,12 +7165,6 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.48.0"
|
||||
@@ -7333,12 +7177,6 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.48.0"
|
||||
@@ -7351,12 +7189,6 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.48.0"
|
||||
@@ -7433,10 +7265,11 @@ dependencies = [
|
||||
"futures-util",
|
||||
"generic-array",
|
||||
"getrandom 0.2.11",
|
||||
"half",
|
||||
"hashbrown 0.14.5",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
"hyper 0.14.30",
|
||||
"indexmap 1.9.3",
|
||||
"itertools 0.10.5",
|
||||
"itertools 0.12.1",
|
||||
@@ -7504,7 +7337,7 @@ dependencies = [
|
||||
"der 0.7.8",
|
||||
"hex",
|
||||
"pem",
|
||||
"ring 0.17.6",
|
||||
"ring",
|
||||
"signature 2.2.0",
|
||||
"spki 0.7.3",
|
||||
"thiserror",
|
||||
|
||||
@@ -76,8 +76,6 @@ clap = { version = "4.0", features = ["derive"] }
comfy-table = "7.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
enum-map = "2.4.2"

@@ -95,7 +93,7 @@ hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"

@@ -104,7 +102,6 @@ hyper = "0.14"
tokio-tungstenite = "0.20.0"
indexmap = "2"
indoc = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"

@@ -113,7 +110,7 @@ libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
memoffset = "0.9"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"

@@ -142,7 +139,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.22"
rustls-pemfile = "2"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"

@@ -164,7 +160,6 @@ strum_macros = "0.26"
svg_fmt = "0.4.3"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
thiserror = "1.0"
tikv-jemallocator = "0.5"
@@ -1,6 +1,8 @@
[](https://neon.tech)

foo

# Neon
@@ -3,13 +3,15 @@ ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_FLAVOR=bullseye-slim

#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
FROM debian:$DEBIAN_FLAVOR AS build-deps
ARG DEBIAN_FLAVOR
RUN apt update && \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \

@@ -280,7 +282,7 @@ FROM build-deps AS vector-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

COPY patches/pgvector.patch /pgvector.patch
COPY compute/patches/pgvector.patch /pgvector.patch

# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.

@@ -366,7 +368,7 @@ FROM build-deps AS rum-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

COPY patches/rum.patch /rum.patch
COPY compute/patches/rum.patch /rum.patch

RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -1027,10 +1029,47 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
#
#########################################################################################

FROM debian:bullseye-slim AS compute-tools-image
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
ARG DEBIAN_FLAVOR

COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

#########################################################################################
#
# Layer "pgbouncer"
#
#########################################################################################

FROM debian:$DEBIAN_FLAVOR AS pgbouncer
ARG DEBIAN_FLAVOR
RUN set -e \
    && apt-get update \
    && apt-get install -y \
        build-essential \
        git \
        libevent-dev \
        libtool \
        pkg-config

# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
ENV PGBOUNCER_TAG=pgbouncer_1_22_1
RUN set -e \
    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
    && cd pgbouncer \
    && ./autogen.sh \
    && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
    && make -j $(nproc) dist_man_MANS= \
    && make install dist_man_MANS=

#########################################################################################
#
# Layers "postgres-exporter" and "sql-exporter"
#
#########################################################################################

FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter

#########################################################################################
#
# Clean up postgres folder before inclusion
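The pgbouncer stage links with `LDFLAGS=-static` so that only the binary needs to be copied into the final image. A quick, hedged way to sanity-check that property after the build; the path follows the `--prefix` used above:

```bash
# Check that the freshly built pgbouncer binary carries no dynamic dependencies
# before copying it into the final image (path follows the --prefix above).
file /usr/local/pgbouncer/bin/pgbouncer
ldd  /usr/local/pgbouncer/bin/pgbouncer || true   # prints "not a dynamic executable" when fully static
```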
@@ -1078,7 +1117,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
COPY compute/patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src

@@ -1086,9 +1125,9 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY patches/pg_hint_plan.patch /ext-src
COPY compute/patches/pg_hint_plan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
COPY compute/patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src

@@ -1097,7 +1136,7 @@ COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
COPY patches/pg_anon.patch /ext-src
COPY compute/patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN case "${PG_VERSION}" in "v17") \
@@ -1144,7 +1183,9 @@ ENV PGDATABASE=postgres
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
FROM debian:$DEBIAN_FLAVOR
ARG DEBIAN_FLAVOR
ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR

# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \

@@ -1160,23 +1201,50 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini

# Metrics exporter binaries and configuration files
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter

COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml

# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions

# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libgeos, libsfcgal1, and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \

RUN apt update && \
    case $DEBIAN_FLAVOR in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # libicu67, locales for collations (including ICU and plpgsql_check)
      # libgdal28, libproj19 for PostGIS
      bullseye*) \
        VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \
        ;; \
      # Version-specific installs for Bookworm (PG17):
      # libicu72, locales for collations (including ICU and plpgsql_check)
      # libgdal32, libproj25 for PostGIS
      bookworm*) \
        VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \
        ;; \
    esac && \
    apt install --no-install-recommends -y \
        gdb \
        libicu67 \
        liblz4-1 \
        libreadline8 \
        libboost-iostreams1.74.0 \

@@ -1185,8 +1253,6 @@ RUN apt update && \
        libboost-system1.74.0 \
        libossp-uuid16 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
        libprotobuf-c1 \
        libsfcgal1 \
        libxml2 \

@@ -1195,7 +1261,8 @@ RUN apt update && \
        libcurl4-openssl-dev \
        locales \
        procps \
        ca-certificates && \
        ca-certificates \
        $VERSION_INSTALLS && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
21  compute/README.md (new file)

@@ -0,0 +1,21 @@
This directory contains files that are needed to build the compute
images, or that are included in the compute images.

Dockerfile.compute-node
    To build the compute image

vm-image-spec.yaml
    Instructions for vm-builder, to turn the compute-node image into
    the corresponding vm-compute-node image.

etc/
    Configuration files included in /etc in the compute image

patches/
    Some extensions need to be patched to work with Neon. This
    directory contains such patches. They are applied to the extension
    sources in Dockerfile.compute-node.

In addition to these, postgres itself, the neon postgres extension,
and compute_ctl are built and copied into the compute image by
Dockerfile.compute-node.
246  compute/etc/neon_collector.yml (new file)

@@ -0,0 +1,246 @@
collector_name: neon_collector
metrics:
- metric_name: lfc_misses
  type: gauge
  help: 'lfc_misses'
  key_labels:
  values: [lfc_misses]
  query: |
    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';

- metric_name: lfc_used
  type: gauge
  help: 'LFC chunks used (chunk = 1MB)'
  key_labels:
  values: [lfc_used]
  query: |
    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';

- metric_name: lfc_hits
  type: gauge
  help: 'lfc_hits'
  key_labels:
  values: [lfc_hits]
  query: |
    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';

- metric_name: lfc_writes
  type: gauge
  help: 'lfc_writes'
  key_labels:
  values: [lfc_writes]
  query: |
    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';

- metric_name: lfc_cache_size_limit
  type: gauge
  help: 'LFC cache size limit in bytes'
  key_labels:
  values: [lfc_cache_size_limit]
  query: |
    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;

- metric_name: connection_counts
  type: gauge
  help: 'Connection counts'
  key_labels:
  - datname
  - state
  values: [count]
  query: |
    select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
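Each `query:` above is plain SQL that sql_exporter runs against the local compute, so any metric can be reproduced by hand; for example the connection_counts query (the connection string here is illustrative):

```bash
# Run the connection_counts metric query manually against a compute
# (connection string is illustrative).
psql "postgresql://user:pass@localhost:5432/postgres" -X -c \
  "select datname, state, count(*) as count
     from pg_stat_activity
    where state <> ''
    group by datname, state;"
```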
- metric_name: pg_stats_userdb
  type: gauge
  help: 'Stats for several oldest non-system dbs'
  key_labels:
  - datname
  value_label: kind
  values:
  - db_size
  - deadlocks
  # Rows
  - inserted
  - updated
  - deleted
  # We export stats for 10 non-system databases. Without this limit
  # it is too easy to abuse the system by creating lots of databases.
  query: |
    select pg_database_size(datname) as db_size, deadlocks,
           tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
           datname
    from pg_stat_database
    where datname IN (
        select datname
        from pg_database
        where datname <> 'postgres' and not datistemplate
        order by oid
        limit 10
    );

- metric_name: max_cluster_size
  type: gauge
  help: 'neon.max_cluster_size setting'
  key_labels:
  values: [max_cluster_size]
  query: |
    select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';

- metric_name: db_total_size
  type: gauge
  help: 'Size of all databases'
  key_labels:
  values: [total]
  query: |
    select sum(pg_database_size(datname)) as total from pg_database;

# DEPRECATED
- metric_name: lfc_approximate_working_set_size
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels:
  values: [approximate_working_set_size]
  query: |
    select neon.approximate_working_set_size(false) as approximate_working_set_size;

- metric_name: lfc_approximate_working_set_size_windows
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels: [duration]
  values: [size]
  # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
  # of durations in a pretty-printed form.
  query: |
    select
      x as duration,
      neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
    from
      (values ('5m'),('15m'),('1h')) as t (x);
- metric_name: compute_current_lsn
|
||||
type: gauge
|
||||
help: 'Current LSN of the database'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
select
|
||||
case
|
||||
when pg_catalog.pg_is_in_recovery()
|
||||
then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
|
||||
else (pg_current_wal_lsn() - '0/0')::FLOAT8
|
||||
end as lsn;
|
||||
|
||||
- metric_name: compute_receive_lsn
|
||||
type: gauge
|
||||
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_catalog.pg_is_in_recovery()
|
||||
THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
|
||||
ELSE 0
|
||||
END AS lsn;
|
||||
|
||||
- metric_name: replication_delay_bytes
|
||||
type: gauge
|
||||
help: 'Bytes between received and replayed LSN'
|
||||
key_labels:
|
||||
values: [replication_delay_bytes]
|
||||
# We use a GREATEST call here because this calculation can be negative.
|
||||
# The calculation is not atomic, meaning after we've gotten the receive
|
||||
# LSN, the replay LSN may have advanced past the receive LSN we
|
||||
# are using for the calculation.
|
||||
query: |
|
||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||
|
||||
- metric_name: replication_delay_seconds
|
||||
type: gauge
|
||||
help: 'Time since last LSN was replayed'
|
||||
key_labels:
|
||||
values: [replication_delay_seconds]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
|
||||
ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
|
||||
END AS replication_delay_seconds;
|
||||
|
||||
- metric_name: checkpoints_req
|
||||
type: gauge
|
||||
help: 'Number of requested checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_req]
|
||||
query: |
|
||||
SELECT checkpoints_req FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: checkpoints_timed
|
||||
type: gauge
|
||||
help: 'Number of scheduled checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_timed]
|
||||
query: |
|
||||
SELECT checkpoints_timed FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: compute_logical_snapshot_files
|
||||
type: gauge
|
||||
help: 'Number of snapshot files in pg_logical/snapshot'
|
||||
key_labels:
|
||||
- timeline_id
|
||||
values: [num_logical_snapshot_files]
|
||||
query: |
|
||||
SELECT
|
||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
|
||||
-- temporary snapshot files are renamed to the actual snapshot files after they are
|
||||
-- completely built. We only WAL-log the completely built snapshot files.
|
||||
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
||||
|
||||
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
|
||||
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
|
||||
|
||||
# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
|
||||
- metric_name: logical_slot_restart_lsn
|
||||
type: gauge
|
||||
help: 'restart_lsn of logical slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [restart_lsn]
|
||||
query: |
|
||||
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
|
||||
from pg_replication_slots
|
||||
where slot_type = 'logical';
|
||||
|
||||
- metric_name: compute_subscriptions_count
|
||||
type: gauge
|
||||
help: 'Number of logical replication subscriptions grouped by enabled/disabled'
|
||||
key_labels:
|
||||
- enabled
|
||||
values: [subscriptions_count]
|
||||
query: |
|
||||
select subenabled::text as enabled, count(*) as subscriptions_count
|
||||
from pg_subscription
|
||||
group by subenabled;
|
||||
|
||||
- metric_name: retained_wal
|
||||
type: gauge
|
||||
help: 'Retained WAL in inactive replication slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [retained_wal]
|
||||
query: |
|
||||
SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
|
||||
FROM pg_replication_slots
|
||||
WHERE active = false;
|
||||
|
||||
- metric_name: wal_is_lost
|
||||
type: gauge
|
||||
help: 'Whether or not the replication slot wal_status is lost'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [wal_is_lost]
|
||||
query: |
|
||||
SELECT slot_name,
|
||||
CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
|
||||
FROM pg_replication_slots;
|
||||
55
compute/etc/neon_collector_autoscaling.yml
Normal file
@@ -0,0 +1,55 @@
collector_name: neon_collector_autoscaling
metrics:
- metric_name: lfc_misses
  type: gauge
  help: 'lfc_misses'
  key_labels:
  values: [lfc_misses]
  query: |
    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';

- metric_name: lfc_used
  type: gauge
  help: 'LFC chunks used (chunk = 1MB)'
  key_labels:
  values: [lfc_used]
  query: |
    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';

- metric_name: lfc_hits
  type: gauge
  help: 'lfc_hits'
  key_labels:
  values: [lfc_hits]
  query: |
    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';

- metric_name: lfc_writes
  type: gauge
  help: 'lfc_writes'
  key_labels:
  values: [lfc_writes]
  query: |
    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';

- metric_name: lfc_cache_size_limit
  type: gauge
  help: 'LFC cache size limit in bytes'
  key_labels:
  values: [lfc_cache_size_limit]
  query: |
    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;

- metric_name: lfc_approximate_working_set_size_windows
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels: [duration_seconds]
  values: [size]
  # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
  # size looking back 1..60 minutes, labeled with the number of minutes.
  query: |
    select
      x::text as duration_seconds,
      neon.approximate_working_set_size_seconds(x) as size
    from
      (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
17
compute/etc/pgbouncer.ini
Normal file
@@ -0,0 +1,17 @@
[databases]
*=host=localhost port=5432 auth_user=cloud_admin
[pgbouncer]
listen_port=6432
listen_addr=0.0.0.0
auth_type=scram-sha-256
auth_user=cloud_admin
auth_dbname=postgres
client_tls_sslmode=disable
server_tls_sslmode=disable
pool_mode=transaction
max_client_conn=10000
default_pool_size=64
max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
33
compute/etc/sql_exporter.yml
Normal file
@@ -0,0 +1,33 @@
# Configuration for sql_exporter
# Global defaults.
global:
  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
  scrape_timeout: 10s
  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
  scrape_timeout_offset: 500ms
  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
  min_interval: 0s
  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
  # as will concurrent scrapes.
  max_connections: 1
  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
  # always be the same as max_connections.
  max_idle_connections: 1
  # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
  # If 0, connections are not closed due to a connection's age.
  max_connection_lifetime: 5m

# The target to monitor and the collectors to execute on it.
target:
  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
  # the schema gets dropped or replaced to match the driver expected DSN format.
  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'

  # Collectors (referenced by name) to execute on the target.
  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
  collectors: [neon_collector]

# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
  - "neon_collector.yml"
33
compute/etc/sql_exporter_autoscaling.yml
Normal file
@@ -0,0 +1,33 @@
# Configuration for sql_exporter for autoscaling-agent
# Global defaults.
global:
  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
  scrape_timeout: 10s
  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
  scrape_timeout_offset: 500ms
  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
  min_interval: 0s
  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
  # as will concurrent scrapes.
  max_connections: 1
  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
  # always be the same as max_connections.
  max_idle_connections: 1
  # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
  # If 0, connections are not closed due to a connection's age.
  max_connection_lifetime: 5m

# The target to monitor and the collectors to execute on it.
target:
  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
  # the schema gets dropped or replaced to match the driver expected DSN format.
  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'

  # Collectors (referenced by name) to execute on the target.
  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
  collectors: [neon_collector_autoscaling]

# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
  - "neon_collector_autoscaling.yml"
3949
compute/patches/cloud_regress_pg16.patch
Normal file
File diff suppressed because it is too large
112
compute/vm-image-spec.yaml
Normal file
@@ -0,0 +1,112 @@
|
||||
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
|
||||
---
|
||||
commands:
|
||||
- name: cgconfigparser
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
|
||||
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
|
||||
# running it as root.
|
||||
- name: chmod-resize-swap
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/resize-swap'
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
|
||||
- name: sql-exporter-autoscaling
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: compute_ctl-resize-swap
|
||||
content: |
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
group neon-postgres {
|
||||
perm {
|
||||
admin {
|
||||
uid = postgres;
|
||||
}
|
||||
task {
|
||||
gid = users;
|
||||
}
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
|
||||
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
|
||||
# requires cgroup v2, so we'll build cgroup-tools ourselves.
|
||||
FROM debian:bullseye-slim as libcgroup-builder
|
||||
ENV LIBCGROUP_VERSION=v2.0.3
|
||||
|
||||
RUN set -exu \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
git \
|
||||
ca-certificates \
|
||||
automake \
|
||||
cmake \
|
||||
make \
|
||||
gcc \
|
||||
byacc \
|
||||
flex \
|
||||
libtool \
|
||||
libpam0g-dev \
|
||||
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
|
||||
&& INSTALL_DIR="/libcgroup-install" \
|
||||
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
|
||||
&& cd libcgroup \
|
||||
# extracted from bootstrap.sh, with modified flags:
|
||||
&& (test -d m4 || mkdir m4) \
|
||||
&& autoreconf -fi \
|
||||
&& rm -rf autom4te.cache \
|
||||
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
|
||||
# actually build the thing...
|
||||
&& make install
|
||||
merge: |
|
||||
# tweak nofile limits
|
||||
RUN set -e \
|
||||
&& echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
|
||||
&& test ! -e /etc/security || ( \
|
||||
echo '* - nofile 1048576' >>/etc/security/limits.conf \
|
||||
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
|
||||
)
|
||||
|
||||
# Allow postgres user (compute_ctl) to run swap resizer.
|
||||
# Need to install sudo in order to allow this.
|
||||
#
|
||||
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
sudo \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
|
||||
|
||||
COPY cgconfig.conf /etc/cgconfig.conf
|
||||
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
@@ -9,7 +9,6 @@ anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -346,7 +346,14 @@ impl StorageController {
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
|
||||
let initdb_args = [
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"--username",
|
||||
&username(),
|
||||
"--no-sync",
|
||||
"--no-instructions",
|
||||
];
|
||||
tracing::info!(
|
||||
"Initializing storage controller database with args: {:?}",
|
||||
initdb_args
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id,
|
||||
availability_zone_id: AvailabilityZone(availability_zone_id),
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a miniature Neon system, use `cargo neon` rather than container images.
|
||||
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
343
docs/rfcs/038-independent-compute-release.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# Independent compute release
|
||||
|
||||
Created at: 2024-08-30. Author: Alexey Kondratov (@ololobus)
|
||||
|
||||
## Summary
|
||||
|
||||
This document proposes an approach to fully independent compute release flow. It attempts to
|
||||
cover the following features:
|
||||
|
||||
- Process is automated as much as possible to minimize human errors.
|
||||
- Compute<->storage protocol compatibility is ensured.
|
||||
- A transparent release history is available with an easy rollback strategy.
|
||||
- Although not in the scope of this document, there is a viable way to extend the proposed release
|
||||
flow to achieve the canary and/or blue-green deployment strategies.
|
||||
|
||||
## Motivation
|
||||
|
||||
Previously, the compute release was tightly coupled to the storage release. This meant that once
|
||||
some storage nodes got restarted with a newer version, all new compute starts using these nodes
|
||||
automatically got a new version. Thus, two releases happen in parallel, which increases the blast
|
||||
radius and makes ownership fuzzy.
|
||||
|
||||
Now, we practice a manual v0 independent compute release flow -- after getting a new compute release
|
||||
image and tag, we pin it region by region using the Admin UI. It's better, but it still has its own flaws:
|
||||
|
||||
1. It's a simple but fairly manual process, as you need to click through a few pages.
|
||||
2. It's prone to human errors, e.g., you could mistype or copy the wrong compute tag.
|
||||
3. We now require an additional approval in the Admin UI, which partially solves point 2,
|
||||
but also makes the whole process pretty annoying, as you constantly need to go back
|
||||
and forth between two people.
|
||||
|
||||
## Non-goals
|
||||
|
||||
It's not the goal of this document to propose a design for some general-purpose release tool like Helm.
|
||||
The document considers how the current compute fleet is orchestrated at Neon. Even if we later
|
||||
decide to split the control plane further (e.g., introduce a separate compute controller), the proposed
|
||||
release process shouldn't change much, i.e., the releases table and API will reside in
|
||||
one of the parts.
|
||||
|
||||
Achieving the canary and/or blue-green deploy strategies is out of the scope of this document. They
|
||||
were kept in mind, though, so it's expected that the proposed approach will lay down the foundation
|
||||
for implementing them in future iterations.
|
||||
|
||||
## Impacted components
|
||||
|
||||
Compute, control plane, CI, observability (some Grafana dashboards may require changes).
|
||||
|
||||
## Prior art
|
||||
|
||||
One of the very close examples is how Helm tracks [releases history](https://helm.sh/docs/helm/helm_history/).
|
||||
|
||||
In the code:
|
||||
|
||||
- [Release](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/release.go#L20-L43)
|
||||
- [Release info](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/info.go#L24-L40)
|
||||
- [Release status](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/status.go#L18-L42)
|
||||
|
||||
TL;DR it has several important attributes:
|
||||
|
||||
- Revision -- unique release ID/primary key. It is not the same as the application version,
|
||||
because the same version can be deployed several times, e.g., after a newer version rollback.
|
||||
- App version -- version of the application chart/code.
|
||||
- Config -- set of overrides to the default config of the application.
|
||||
- Status -- current status of the release in the history.
|
||||
- Timestamps -- tracks when a release was created and deployed.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### Separate release branch
|
||||
|
||||
We will use a separate release branch, `release-compute`, to have a clean history for releases and commits.
|
||||
In order to avoid confusion with storage releases, we will use a different prefix for compute [git release
|
||||
tags](https://github.com/neondatabase/neon/releases) -- `release-compute-XXXX`. We will use the same tag for
|
||||
Docker images as well. The `neondatabase/compute-node-v16:release-compute-XXXX` looks longer and a bit redundant,
|
||||
but it's better to have image and git tags in sync.
|
||||
|
||||
Currently, control plane relies on the numeric compute and storage release versions to decide on compute->storage
|
||||
compatibility. Once we implement this proposal, we should drop this code as release numbers will be completely
|
||||
independent. The only constraint we want is that it must monotonically increase within the same release branch.
|
||||
|
||||
### Compute config/settings manifest
|
||||
|
||||
We will create a new sub-directory `compute` and file `compute/manifest.yaml` with a structure:
|
||||
|
||||
```yaml
|
||||
pg_settings:
|
||||
# Common settings for primaries and secondaries of all versions.
|
||||
common:
|
||||
wal_log_hints: "off"
|
||||
max_wal_size: "1024"
|
||||
|
||||
per_version:
|
||||
14:
|
||||
# Common settings for both replica and primary of version PG 14
|
||||
common:
|
||||
shared_preload_libraries: "neon,pg_stat_statements,extension_x"
|
||||
15:
|
||||
common:
|
||||
shared_preload_libraries: "neon,pg_stat_statements,extension_x"
|
||||
# Settings that should be applied only to replicas
|
||||
replica:
|
||||
# Available only starting with Postgres 15
|
||||
recovery_prefetch: "off"
|
||||
# ...
|
||||
17:
|
||||
common:
|
||||
# For example, if third-party `extension_x` is not yet available for PG 17
|
||||
shared_preload_libraries: "neon,pg_stat_statements"
|
||||
replica:
|
||||
recovery_prefetch: "off"
|
||||
```
|
||||
|
||||
**N.B.** Setting value should be a string with `on|off` for booleans and a number (as a string)
|
||||
without units for all numeric settings. That's how the control plane currently operates.
|
||||
|
||||
The priority of settings will be (a higher number is a higher priority):
|
||||
|
||||
1. Any static and hard-coded settings in the control plane
|
||||
2. `pg_settings->common`
|
||||
3. Per-version `common`
|
||||
4. Per-version `replica`
|
||||
5. Any per-user/project/endpoint overrides in the control plane
|
||||
6. Any dynamic setting calculated based on the compute size
|
||||
|
||||
**N.B.** For simplicity, we do not do any custom logic for `shared_preload_libraries`, so it's completely
|
||||
overridden if specified on some level. Make sure that you include all necessary extensions in it when you
|
||||
do any overrides.
|
||||
|
||||
**N.B.** There is a tricky question about what to do with custom compute image pinning we sometimes
|
||||
do for particular projects and customers. That's usually some ad-hoc work and images are based on
|
||||
the latest compute image, so it's relatively safe to assume that we could use settings from the latest compute
|
||||
release. If for some reason that's not true, and further overrides are needed, it's also possible to do
|
||||
on the project level together with pinning the image, so it's on-call/engineer/support responsibility to
|
||||
ensure that compute starts with the specified custom image. The only real risk is that compute image will get
|
||||
stale and settings from new releases will drift away, so eventually it will get something incompatible,
|
||||
but i) this is some operational issue, as we do not want stale images anyway, and ii) base settings
|
||||
receive something really new so rarely that the chance of this happening is very low. If we want to solve it completely,
|
||||
then together with pinning the image we could also pin the matching release revision in the control plane.
|
||||
|
||||
The compute team will own the content of `compute/manifest.yaml`.
|
||||
|
||||
### Control plane: releases table
|
||||
|
||||
In order to store information about releases, the control plane will use a table `compute_releases` with the following
|
||||
schema:
|
||||
|
||||
```sql
|
||||
CREATE TABLE compute_releases (
|
||||
-- Unique release ID
|
||||
-- N.B. Revision won't be synchronized across all regions, because all control planes are technically independent
|
||||
-- services. We have the same situation with Helm releases as well because they could be deployed and rolled back
|
||||
-- independently in different clusters.
|
||||
revision BIGSERIAL PRIMARY KEY,
|
||||
-- Numeric version of the compute image, e.g. 9057
|
||||
version BIGINT NOT NULL,
|
||||
-- Compute image tag, e.g. `release-9057`
|
||||
tag TEXT NOT NULL,
|
||||
-- Current release status. Currently, it will be a simple enum
|
||||
-- * `deployed` -- release is deployed and used for new compute starts.
|
||||
-- Exactly one release can have this status at a time.
|
||||
-- * `superseded` -- release has been replaced by a newer one.
|
||||
-- But we can always extend it in the future when we need more statuses
|
||||
-- for more complex deployment strategies.
|
||||
status TEXT NOT NULL,
|
||||
-- Any additional metadata for compute in the corresponding release
|
||||
manifest JSONB NOT NULL,
|
||||
-- Timestamp when release record was created in the control plane database
|
||||
created_at TIMESTAMP NOT NULL DEFAULT now(),
|
||||
-- Timestamp when release deployment was finished
|
||||
deployed_at TIMESTAMP
|
||||
);
|
||||
```
|
||||
|
||||
We keep track of the old releases not only for the sake of audit, but also because we usually have ~30% of
|
||||
old computes started using the image from one of the previous releases. Yet, when users want to reconfigure
|
||||
them without restarting, the control plane needs to know what settings are applicable to them, so we also need
|
||||
information about the previous releases that are readily available. There could be some other auxiliary info
|
||||
needed as well: supported extensions, compute flags, etc.
|
||||
|
||||
**N.B.** Here, we can end up in an ambiguous situation when the same compute image is deployed twice, e.g.,
|
||||
it was deployed once, then rolled back, and then deployed again, potentially with a different manifest. Yet,
|
||||
we could've started some computes with the first deployment and some with the second. Thus, when we need to
|
||||
look up the manifest for the compute by its image tag, we will see two records in the table with the same tag,
|
||||
but different revision numbers. We can assume that this could happen only in case of rollbacks, so we
|
||||
can just take the latest revision for the given tag.
|
||||
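A minimal sketch of that lookup, assuming the `compute_releases` schema above (the tag value is illustrative,
taken from the schema comment):

```sql
-- Resolve the manifest for computes started from image tag 'release-9057'.
-- If the tag was deployed more than once (deploy -> rollback -> deploy again),
-- simply take the row with the highest revision.
SELECT revision, manifest
FROM compute_releases
WHERE tag = 'release-9057'
ORDER BY revision DESC
LIMIT 1;
```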
|
||||
### Control plane: management API
|
||||
|
||||
The control plane will implement new API methods to manage releases:
|
||||
|
||||
1. `POST /management/api/v2/compute_releases` to create a new release. With payload
|
||||
|
||||
```json
|
||||
{
|
||||
"version": 9057,
|
||||
"tag": "release-9057",
|
||||
"manifest": {}
|
||||
}
|
||||
```
|
||||
|
||||
and response
|
||||
|
||||
```json
|
||||
{
|
||||
"revision": 53,
|
||||
"version": 9057,
|
||||
"tag": "release-9057",
|
||||
"status": "deployed",
|
||||
"manifest": {},
|
||||
"created_at": "2024-08-15T15:52:01.0000Z",
|
||||
"deployed_at": "2024-08-15T15:52:01.0000Z",
|
||||
}
|
||||
```
|
||||
|
||||
Here, we can actually mix-in custom (remote) extensions metadata into the `manifest`, so that the control plane
|
||||
will get information about all available extensions not bundled into compute image. The corresponding
|
||||
workflow in `neondatabase/build-custom-extensions` should produce it as an artifact and make
|
||||
it accessible to the workflow in the `neondatabase/infra`. See the complete release flow below. Doing that,
|
||||
we put a constraint that a new custom extension requires a new compute release, which is good for safety,
but is not exactly what we want operationally (we want to be able to deploy new extensions without new
|
||||
images). Yet, it can be solved incrementally: v0 -- do not do anything with extensions at all;
|
||||
v1 -- put them into the same manifest; v2 -- make them separate entities with their own lifecycle.
|
||||
|
||||
**N.B.** This method is intended to be used in CI workflows, and CI/network can be flaky. It's reasonable
|
||||
to assume that we could retry the request several times, even though it's already succeeded. Although it's
|
||||
not a big deal to create several identical releases one-by-one, it's better to avoid it, so the control plane
|
||||
should check if the latest release is identical and just return `304 Not Modified` in this case.
|
||||
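A sketch of that check, assuming the `compute_releases` table from the previous section (the literal values
stand in for the incoming payload):

```sql
-- If the newest release already matches the incoming payload, the control plane
-- can answer 304 Not Modified instead of inserting a duplicate row.
SELECT EXISTS (
    SELECT 1
    FROM compute_releases
    WHERE revision = (SELECT max(revision) FROM compute_releases)
      AND version  = 9057
      AND tag      = 'release-9057'
      AND manifest = '{}'::jsonb
) AS is_duplicate;
```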
|
||||
2. `POST /management/api/v2/compute_releases/rollback` to rollback to any previously deployed release. With payload
|
||||
including the revision of the release to rollback to:
|
||||
|
||||
```json
|
||||
{
|
||||
"revision": 52
|
||||
}
|
||||
```
|
||||
|
||||
Rollback marks the current release as `superseded` and creates a new release with all the same data as the
|
||||
requested revision, but with a new revision number.
|
||||
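In terms of the `compute_releases` table above, the rollback boils down to something like the following
sketch (revision 52 as in the example payload; the actual control plane code may differ):

```sql
BEGIN;
-- The currently deployed release is superseded by the rollback.
UPDATE compute_releases SET status = 'superseded' WHERE status = 'deployed';
-- Re-insert the data of the requested revision as a new release; BIGSERIAL assigns a fresh revision.
INSERT INTO compute_releases (version, tag, status, manifest, deployed_at)
SELECT version, tag, 'deployed', manifest, now()
FROM compute_releases
WHERE revision = 52;
COMMIT;
```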
|
||||
This rollback API is not strictly needed, as we can just use `infra` repo workflow to deploy any
|
||||
available tag. It's still nice to have for on-call and any urgent matters, for example, if we need
|
||||
to rollback and GitHub is down. It's much easier to specify only the revision number vs. crafting
|
||||
all the necessary data for the new release payload.
|
||||
|
||||
### Compute->storage compatibility tests
|
||||
|
||||
In order to safely release new compute versions independently from storage, we need to ensure that the currently
|
||||
deployed storage is compatible with the new compute version. Currently, we maintain backward compatibility
|
||||
in storage, but newer computes may require a newer storage version.
|
||||
|
||||
Remote end-to-end (e2e) tests [already accept](https://github.com/neondatabase/cloud/blob/e3468d433e0d73d02b7d7e738d027f509b522408/.github/workflows/testing.yml#L43-L48)
|
||||
`storage_image_tag` and `compute_image_tag` as separate inputs. That means that we could reuse e2e tests to ensure
|
||||
compatibility between storage and compute:
|
||||
|
||||
1. Pick the latest storage release tag and use it as `storage_image_tag`.
|
||||
2. Pick a new compute tag built in the current compute release PR and use it as `compute_image_tag`.
|
||||
Here, we should use a temporary ECR image tag, because the final tag will be known only after the release PR is merged.
|
||||
3. Trigger e2e tests as usual.
|
||||
|
||||
### Release flow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
|
||||
actor oncall as Compute on-call person
|
||||
participant neon as neondatabase/neon
|
||||
|
||||
box private
|
||||
participant cloud as neondatabase/cloud
|
||||
participant exts as neondatabase/build-custom-extensions
|
||||
participant infra as neondatabase/infra
|
||||
end
|
||||
|
||||
box cloud
|
||||
participant preprod as Pre-prod control plane
|
||||
participant prod as Production control plane
|
||||
participant k8s as Compute k8s
|
||||
end
|
||||
|
||||
oncall ->> neon: Open release PR into release-compute
|
||||
|
||||
activate neon
|
||||
neon ->> cloud: CI: trigger e2e compatibility tests
|
||||
activate cloud
|
||||
cloud -->> neon: CI: e2e tests pass
|
||||
deactivate cloud
|
||||
neon ->> neon: CI: pass PR checks, get approvals
|
||||
deactivate neon
|
||||
|
||||
oncall ->> neon: Merge release PR into release-compute
|
||||
|
||||
activate neon
|
||||
neon ->> neon: CI: pass checks, build and push images
|
||||
neon ->> exts: CI: trigger extensions build
|
||||
activate exts
|
||||
exts -->> neon: CI: extensions are ready
|
||||
deactivate exts
|
||||
neon ->> neon: CI: create release tag
|
||||
neon ->> infra: Trigger release workflow using the produced tag
|
||||
deactivate neon
|
||||
|
||||
activate infra
|
||||
infra ->> infra: CI: pass checks
|
||||
infra ->> preprod: Release new compute image to pre-prod automatically <br/> POST /management/api/v2/compute_releases
|
||||
activate preprod
|
||||
preprod -->> infra: 200 OK
|
||||
deactivate preprod
|
||||
|
||||
infra ->> infra: CI: wait for per-region production deploy approvals
|
||||
oncall ->> infra: CI: approve deploys region by region
|
||||
infra ->> k8s: Prewarm new compute image
|
||||
infra ->> prod: POST /management/api/v2/compute_releases
|
||||
activate prod
|
||||
prod -->> infra: 200 OK
|
||||
deactivate prod
|
||||
deactivate infra
|
||||
```
|
||||
|
||||
## Further work
|
||||
|
||||
As briefly mentioned in other sections, eventually, we would like to use more complex deployment strategies.
|
||||
For example, we can pass a fraction of the total compute starts that should use the new release. Then we can
|
||||
mark the release as `partial` or `canary` and monitor its performance. If everything is fine, we can promote it
|
||||
to `deployed` status. If not, we can roll back to the previous one.
|
||||
|
||||
## Alternatives
|
||||
|
||||
In theory, we can try using Helm as-is:
|
||||
|
||||
1. Write a compute Helm chart. That will actually have only some config map, which the control plane can access and read.
|
||||
N.B. We could reuse the control plane chart as well, but then the release is again not fully independent, and ownership is even fuzzier.
|
||||
2. The control plane will read it and start using the new compute version for new starts.
|
||||
|
||||
Drawbacks:
|
||||
|
||||
1. Helm releases work best if the workload is controlled by the Helm chart itself. Then you can have different
|
||||
deployment strategies like rolling update or canary or blue/green deployments. At Neon, the compute starts are controlled
|
||||
by control plane, so it makes it much more tricky.
|
||||
2. Releases visibility will suffer, i.e. instead of a nice table in the control plane and Admin UI, we would need to use
|
||||
`helm` cli and/or K8s UIs like K8sLens.
|
||||
3. We do not restart all computes shortly after the new version release. This means that for some features and compatibility
|
||||
purposes (see above) the control plane may need some auxiliary info from the previous releases.
|
||||
@@ -104,9 +104,6 @@ pub struct ConfigToml {
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
#[serde(skip_serializing)]
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
|
||||
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
|
||||
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
@@ -384,7 +381,6 @@ impl Default for ConfigToml {
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: None,
|
||||
compact_level0_phase1_value_access: Default::default(),
|
||||
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||
|
||||
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -57,7 +58,7 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub availability_zone_id: String,
|
||||
pub availability_zone_id: AvailabilityZone,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -74,10 +75,19 @@ pub struct TenantPolicyRequest {
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct AvailabilityZone(pub String);
|
||||
|
||||
impl Display for AvailabilityZone {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, String>,
|
||||
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -37,14 +37,11 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
/// ```mermaid
|
||||
/// stateDiagram-v2
|
||||
///
|
||||
/// [*] --> Loading: spawn_load()
|
||||
/// [*] --> Attaching: spawn_attach()
|
||||
///
|
||||
/// Loading --> Activating: activate()
|
||||
/// Attaching --> Activating: activate()
|
||||
/// Activating --> Active: infallible
|
||||
///
|
||||
/// Loading --> Broken: load() failure
|
||||
/// Attaching --> Broken: attach() failure
|
||||
///
|
||||
/// Active --> Stopping: set_stopping(), part of shutdown & detach
|
||||
@@ -68,10 +65,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
)]
|
||||
#[serde(tag = "slug", content = "data")]
|
||||
pub enum TenantState {
|
||||
/// This tenant is being loaded from local disk.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
Loading,
|
||||
/// This tenant is being attached to the pageserver.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
@@ -121,8 +114,6 @@ impl TenantState {
|
||||
// But, our attach task might still be fetching the remote timelines, etc.
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call atttachment status Attached.
|
||||
Self::Active => Attached,
|
||||
@@ -191,10 +182,11 @@ impl LsnLease {
|
||||
}
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
///
|
||||
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
|
||||
/// useless. Remove, once we've checked that there's no client code left that looks at this.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
|
||||
Loading,
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
|
||||
Attaching,
|
||||
}
|
||||
@@ -1562,11 +1554,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tenantstatus_activating_serde() {
|
||||
let states = [
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
let states = [TenantState::Activating(ActivatingFrom::Attaching)];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
|
||||
let actual = serde_json::to_string(&states).unwrap();
|
||||
|
||||
@@ -1581,13 +1570,7 @@ mod tests {
|
||||
fn tenantstatus_activating_strum() {
|
||||
// tests added, because we use these for metrics
|
||||
let examples = [
|
||||
(line!(), TenantState::Loading, "Loading"),
|
||||
(line!(), TenantState::Attaching, "Attaching"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
"Activating",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
|
||||
@@ -19,6 +19,7 @@ bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
git-version.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
|
||||
@@ -82,7 +82,7 @@ impl ApiError {
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
}
|
||||
|
||||
@@ -21,7 +21,13 @@
|
||||
//!
|
||||
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
|
||||
|
||||
use std::{sync::Mutex, time::Duration};
|
||||
use std::{
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Mutex,
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use tokio::{sync::Notify, time::Instant};
|
||||
|
||||
@@ -128,6 +134,7 @@ impl LeakyBucketState {
|
||||
|
||||
pub struct RateLimiter {
|
||||
pub config: LeakyBucketConfig,
|
||||
pub sleep_counter: AtomicU64,
|
||||
pub state: Mutex<LeakyBucketState>,
|
||||
/// a queue to provide this fair ordering.
|
||||
pub queue: Notify,
|
||||
@@ -144,6 +151,7 @@ impl Drop for Requeue<'_> {
|
||||
impl RateLimiter {
|
||||
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
|
||||
RateLimiter {
|
||||
sleep_counter: AtomicU64::new(0),
|
||||
state: Mutex::new(LeakyBucketState::with_initial_tokens(
|
||||
&config,
|
||||
initial_tokens,
|
||||
@@ -163,15 +171,16 @@ impl RateLimiter {
|
||||
|
||||
/// returns true if we did throttle
|
||||
pub async fn acquire(&self, count: usize) -> bool {
|
||||
let mut throttled = false;
|
||||
|
||||
let start = tokio::time::Instant::now();
|
||||
|
||||
let start_count = self.sleep_counter.load(Ordering::Acquire);
|
||||
let mut end_count = start_count;
|
||||
|
||||
// wait until we are the first in the queue
|
||||
let mut notified = std::pin::pin!(self.queue.notified());
|
||||
if !notified.as_mut().enable() {
|
||||
throttled = true;
|
||||
notified.await;
|
||||
end_count = self.sleep_counter.load(Ordering::Acquire);
|
||||
}
|
||||
|
||||
// notify the next waiter in the queue when we are done.
|
||||
@@ -184,9 +193,22 @@ impl RateLimiter {
|
||||
.unwrap()
|
||||
.add_tokens(&self.config, start, count as f64);
|
||||
match res {
|
||||
Ok(()) => return throttled,
|
||||
Ok(()) => return end_count > start_count,
|
||||
Err(ready_at) => {
|
||||
throttled = true;
|
||||
struct Increment<'a>(&'a AtomicU64);
|
||||
|
||||
impl Drop for Increment<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.0.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
}
|
||||
|
||||
// increment the counter after we finish sleeping (or cancel this task).
|
||||
// this ensures that tasks that have already started the acquire will observe
|
||||
// the new sleep count when they are allowed to resume on the notify.
|
||||
let _inc = Increment(&self.sleep_counter);
|
||||
end_count += 1;
|
||||
|
||||
tokio::time::sleep_until(ready_at).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,6 +92,10 @@ pub mod toml_edit_ext;
|
||||
|
||||
pub mod circuit_breaker;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
@@ -131,7 +135,7 @@ macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
// this should try GIT_VERSION first only then git_version::git_version!
|
||||
const $const_identifier: &::core::primitive::str = {
|
||||
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
|
||||
const __COMMIT_FROM_GIT: &::core::primitive::str = $crate::git_version::git_version! {
|
||||
prefix = "",
|
||||
fallback = "unknown",
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
|
||||
@@ -27,7 +27,6 @@ crc32c.workspace = true
|
||||
either.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Quantify a single walredo manager's throughput under N concurrent callers.
|
||||
//!
|
||||
//! The benchmark implementation ([`bench_impl`]) is parametrized by
|
||||
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
|
||||
//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
|
||||
//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
|
||||
//! - `nclients` => number of clients (more on this shortly).
|
||||
//!
|
||||
@@ -10,7 +10,7 @@
|
||||
//! Each task executes the `redo_work` `n_redos/nclients` times.
|
||||
//!
|
||||
//! We exercise the following combinations:
|
||||
//! - `redo_work = short / medium``
|
||||
//! - `redo_work = ping / short / medium``
|
||||
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
|
||||
//!
|
||||
//! We let `criterion` determine the `n_redos` using `iter_custom`.
|
||||
@@ -27,33 +27,43 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//! 2024-09-18 on im4gn.2xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! ping/1 time: [21.789 µs 21.918 µs 22.078 µs]
|
||||
//! ping/2 time: [27.686 µs 27.812 µs 27.970 µs]
|
||||
//! ping/4 time: [35.468 µs 35.671 µs 35.926 µs]
|
||||
//! ping/8 time: [59.682 µs 59.987 µs 60.363 µs]
|
||||
//! ping/16 time: [101.79 µs 102.37 µs 103.08 µs]
|
||||
//! ping/32 time: [184.18 µs 185.15 µs 186.36 µs]
|
||||
//! ping/64 time: [349.86 µs 351.45 µs 353.47 µs]
|
||||
//! ping/128 time: [684.53 µs 687.98 µs 692.17 µs]
|
||||
//! short/1 time: [31.833 µs 32.126 µs 32.428 µs]
|
||||
//! short/2 time: [35.558 µs 35.756 µs 35.992 µs]
|
||||
//! short/4 time: [44.850 µs 45.138 µs 45.484 µs]
|
||||
//! short/8 time: [65.985 µs 66.379 µs 66.853 µs]
|
||||
//! short/16 time: [127.06 µs 127.90 µs 128.87 µs]
|
||||
//! short/32 time: [252.98 µs 254.70 µs 256.73 µs]
|
||||
//! short/64 time: [497.13 µs 499.86 µs 503.26 µs]
|
||||
//! short/128 time: [987.46 µs 993.45 µs 1.0004 ms]
|
||||
//! medium/1 time: [137.91 µs 138.55 µs 139.35 µs]
|
||||
//! medium/2 time: [192.00 µs 192.91 µs 194.07 µs]
|
||||
//! medium/4 time: [389.62 µs 391.55 µs 394.01 µs]
|
||||
//! medium/8 time: [776.80 µs 780.33 µs 784.77 µs]
|
||||
//! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms]
|
||||
//! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms]
|
||||
//! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms]
|
||||
//! medium/128 time: [10.412 ms 10.574 ms 10.718 ms]
|
||||
//! ```
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
future::Future,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
@@ -61,40 +71,59 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
macro_rules! bench_group {
|
||||
($name:expr, $redo_work:expr) => {{
|
||||
let name: &str = $name;
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(name);
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}};
|
||||
}
|
||||
//
|
||||
// benchmark the protocol implementation
|
||||
//
|
||||
let pg_version = 14;
|
||||
bench_group!(
|
||||
"ping",
|
||||
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
|
||||
let _: () = mgr.ping(pg_version).await.unwrap();
|
||||
})
|
||||
);
|
||||
//
|
||||
// benchmarks with actual record redo
|
||||
//
|
||||
let make_redo_work = |req: &'static Request| {
|
||||
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
|
||||
let page = req.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
})
|
||||
};
|
||||
bench_group!("short", {
|
||||
static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
|
||||
make_redo_work(&REQUEST)
|
||||
});
|
||||
bench_group!("medium", {
|
||||
static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
|
||||
make_redo_work(&REQUEST)
|
||||
});
|
||||
}
|
||||
criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
where
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
Fut: Future<Output = ()> + Send + 'static,
{
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
@@ -135,17 +164,20 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
})
}

async fn client(
async fn client<F, Fut>(
mgr: Arc<PostgresRedoManager>,
start: Arc<Barrier>,
redo_work: Arc<Request>,
redo_work: Arc<F>,
n_redos: u64,
) -> Duration {
) -> Duration
where
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
Fut: Future<Output = ()> + Send + 'static,
{
start.wait().await;
let start = Instant::now();
for _ in 0..n_redos {
let page = redo_work.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
redo_work(Arc::clone(&mgr)).await;
// The real pageserver will rarely if ever do 2 walredos in a row without
// yielding to the executor.
tokio::task::yield_now().await;

@@ -432,7 +432,7 @@ impl Client {
self.mgmt_api_endpoint
);

self.request(Method::POST, &uri, req)
self.request(Method::PUT, &uri, req)
.await?
.json()
.await

@@ -12,7 +12,6 @@ anyhow.workspace = true
async-stream.workspace = true
clap = { workspace = true, features = ["string"] }
futures.workspace = true
git-version.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true

@@ -10,7 +10,6 @@ license.workspace = true
anyhow.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true

@@ -324,7 +324,6 @@ impl PageServerConf {
max_vectored_read_bytes,
image_compression,
ephemeral_bytes_per_memory_kb,
compact_level0_phase1_value_access: _,
l0_flush,
virtual_file_direct_io,
concurrent_tenant_warmup,
@@ -535,16 +534,6 @@ mod tests {
.expect("parse_and_validate");
}

#[test]
fn test_compactl0_phase1_access_mode_is_ignored_silently() {
let input = indoc::indoc! {r#"
[compact_level0_phase1_value_access]
mode = "streaming-kmerge"
validate = "key-lsn-value"
"#};
toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
}

/// If there's a typo in the pageserver config, we'd rather catch that typo
/// and fail pageserver startup than silently ignoring the typo, leaving whoever
/// made it in the believe that their config change is effective.

@@ -2,7 +2,7 @@ use std::collections::HashMap;

use futures::Future;
use pageserver_api::{
controller_api::NodeRegisterRequest,
controller_api::{AvailabilityZone, NodeRegisterRequest},
shard::TenantShardId,
upcall_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
@@ -148,10 +148,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));

match az_id_from_metadata {
Some(az_id) => Some(az_id),
Some(az_id) => Some(AvailabilityZone(az_id)),
None => {
tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
conf.availability_zone.clone()
conf.availability_zone.clone().map(AvailabilityZone)
}
}
};

@@ -589,6 +589,10 @@ async fn timeline_create_handler(
StatusCode::SERVICE_UNAVAILABLE,
HttpErrorBody::from_msg(e.to_string()),
),
Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response(
StatusCode::NOT_ACCEPTABLE,
HttpErrorBody::from_msg(e.to_string()),
),
Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
StatusCode::SERVICE_UNAVAILABLE,
HttpErrorBody::from_msg("tenant shutting down".to_string()),
@@ -2955,7 +2959,7 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.post(
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|r| api_handler(r, timeline_archival_config_handler),
)

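For context on the route change above, the archival_config endpoint (and the matching mgmt API client call) now uses PUT instead of POST. A hedged sketch of calling it with reqwest follows; the base URL, IDs, and the request body shape are assumptions for illustration only, not taken from this diff.

```rust
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Hypothetical pageserver management endpoint and IDs.
    let base = "http://127.0.0.1:9898";
    let tenant_shard_id = "1f359dd625e519a1a4e8d7509690f6fc";
    let timeline_id = "de200bd42b49cc1814412c7e592dd6e9";
    let url = format!(
        "{base}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config"
    );

    let resp = reqwest::Client::new()
        .put(url.as_str()) // was POST before this change
        .json(&json!({ "state": "Archived" })) // assumed body shape
        .send()
        .await?;

    println!("status: {}", resp.status());
    Ok(())
}
```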
@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
}

struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
global_metric: &'a Histogram,
global_latency_histo: &'a Histogram,

// Optional because not all op types are tracked per-timeline
timeline_metric: Option<&'a Histogram>,
per_timeline_latency_histo: Option<&'a Histogram>,

ctx: &'c RequestContext,
start: std::time::Instant,
@@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
elapsed
}
};
self.global_metric.observe(ex_throttled.as_secs_f64());
if let Some(timeline_metric) = self.timeline_metric {
timeline_metric.observe(ex_throttled.as_secs_f64());
self.global_latency_histo
.observe(ex_throttled.as_secs_f64());
if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
}
}
}
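The timer type above records into one global histogram and, when present, a per-timeline histogram exactly once, on drop. A minimal sketch of that RAII pattern, written against the prometheus crate directly (metric names here are illustrative, not the pageserver's):

```rust
use std::time::Instant;

use once_cell::sync::Lazy;
use prometheus::{register_histogram, Histogram};

static GLOBAL_LATENCY: Lazy<Histogram> =
    Lazy::new(|| register_histogram!("sketch_query_seconds_global", "illustrative").unwrap());
static PER_TIMELINE_LATENCY: Lazy<Histogram> =
    Lazy::new(|| register_histogram!("sketch_query_seconds", "illustrative").unwrap());

/// Observes the elapsed time into the global histogram and, if present,
/// a per-timeline histogram, when the guard is dropped.
struct LatencyTimer<'a> {
    global: &'a Histogram,
    per_timeline: Option<&'a Histogram>,
    start: Instant,
}

impl Drop for LatencyTimer<'_> {
    fn drop(&mut self) {
        let secs = self.start.elapsed().as_secs_f64();
        self.global.observe(secs);
        if let Some(h) = self.per_timeline {
            h.observe(secs);
        }
    }
}

fn main() {
    let _timer = LatencyTimer {
        global: &GLOBAL_LATENCY,
        per_timeline: Some(&PER_TIMELINE_LATENCY),
        start: Instant::now(),
    };
    // ... do the measured work; the observation happens when _timer drops.
}
```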
@@ -1240,10 +1241,32 @@ pub enum SmgrQueryType {

#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
global_metrics: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage: Histogram,
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage_started: IntCounter,
per_timeline_getpage_latency: Histogram,
}

static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
"pageserver_smgr_query_started_global_count",
"Number of smgr queries started, aggregated by query type.",
&["smgr_query_type"],
)
.expect("failed to define a metric")
});

static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
"pageserver_smgr_query_started_count",
"Number of smgr queries started, aggregated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});

static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
@@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let global_metrics = std::array::from_fn(|i| {
let global_started = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_STARTED_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});
let global_latency = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_TIME_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
});

let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
@@ -1334,18 +1363,32 @@ impl SmgrQueryTimePerTimeline {
&timeline_id,
])
.unwrap();
let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
&shard_slug,
&timeline_id,
])
.unwrap();

Self {
global_metrics,
per_timeline_getpage,
global_started,
global_latency,
per_timeline_getpage_latency,
per_timeline_getpage_started,
}
}
pub(crate) fn start_timer<'c: 'a, 'a>(
&'a self,
op: SmgrQueryType,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
let global_metric = &self.global_metrics[op as usize];
) -> Option<impl Drop + 'a> {
let start = Instant::now();

self.global_started[op as usize].inc();

// We subtract time spent throttled from the observed latency.
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
Err(error) => {
@@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline {
}
}

let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
Some(&self.per_timeline_getpage)
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
self.per_timeline_getpage_started.inc();
Some(&self.per_timeline_getpage_latency)
} else {
None
};

Some(GlobalAndPerTimelineHistogramTimer {
global_metric,
timeline_metric,
global_latency_histo: &self.global_latency[op as usize],
per_timeline_latency_histo,
ctx,
start,
op,
@@ -1423,9 +1467,12 @@ mod smgr_query_time_tests {
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
.sum();
(global, metrics.per_timeline_getpage.get_sample_count())
(
global,
metrics.per_timeline_getpage_latency.get_sample_count(),
)
};

let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1487,7 +1534,7 @@ impl BasebackupQueryTime {
pub(crate) fn start_recording<'c: 'a, 'a>(
&'a self,
ctx: &'c RequestContext,
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
let start = Instant::now();
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
@@ -2576,6 +2623,12 @@ impl TimelineMetrics {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}

let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
shard_id,
timeline_id,
]);
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
@@ -2592,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}

tenant_throttling::remove_tenant_metrics(tenant_shard_id);

// we leave the BROKEN_TENANTS_SET entry if any
}

@@ -3055,41 +3110,173 @@ pub mod tokio_epoll_uring {
pub(crate) mod tenant_throttling {
use metrics::{register_int_counter_vec, IntCounter};
use once_cell::sync::Lazy;
use utils::shard::TenantShardId;

use crate::tenant::{self, throttle::Metric};

pub(crate) struct TimelineGet {
wait_time: IntCounter,
count: IntCounter,
struct GlobalAndPerTenantIntCounter {
global: IntCounter,
per_tenant: IntCounter,
}

pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
impl GlobalAndPerTenantIntCounter {
#[inline(always)]
pub(crate) fn inc(&self) {
self.inc_by(1)
}
#[inline(always)]
pub(crate) fn inc_by(&self, n: u64) {
self.global.inc_by(n);
self.per_tenant.inc_by(n);
}
}

pub(crate) struct TimelineGet {
count_accounted_start: GlobalAndPerTenantIntCounter,
count_accounted_finish: GlobalAndPerTenantIntCounter,
wait_time: GlobalAndPerTenantIntCounter,
count_throttled: GlobalAndPerTenantIntCounter,
}

static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start_global",
"Count of tenant throttling starts, by kind of throttle.",
&["kind"]
)
.unwrap()
});

static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});

let kind = "timeline_get";
TimelineGet {
wait_time: WAIT_USECS.with_label_values(&[kind]),
count: WAIT_COUNT.with_label_values(&[kind]),
}
.unwrap()
});
static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_start",
"Count of tenant throttling starts, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish_global",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_accounted_finish",
"Count of tenant throttling finishes, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum",
"Sum of microseconds that spent waiting throttle by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});

impl Metric for &'static TimelineGet {
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count",
"Count of tenant throttlings, by kind of throttle.",
&["kind", "tenant_id", "shard_id"]
)
.unwrap()
});

const KIND: &str = "timeline_get";

impl TimelineGet {
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
let per_tenant_label_values = &[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
];
TimelineGet {
count_accounted_start: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
count_accounted_finish: {
GlobalAndPerTenantIntCounter {
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
wait_time: {
GlobalAndPerTenantIntCounter {
global: WAIT_USECS.with_label_values(&[KIND]),
per_tenant: WAIT_USECS_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
count_throttled: {
GlobalAndPerTenantIntCounter {
global: WAIT_COUNT.with_label_values(&[KIND]),
per_tenant: WAIT_COUNT_PER_TENANT
.with_label_values(per_tenant_label_values),
}
},
}
}
}

pub(crate) fn preinitialize_global_metrics() {
Lazy::force(&COUNT_ACCOUNTED_START);
Lazy::force(&COUNT_ACCOUNTED_FINISH);
Lazy::force(&WAIT_USECS);
Lazy::force(&WAIT_COUNT);
}

pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
for m in &[
&COUNT_ACCOUNTED_START_PER_TENANT,
&COUNT_ACCOUNTED_FINISH_PER_TENANT,
&WAIT_USECS_PER_TENANT,
&WAIT_COUNT_PER_TENANT,
] {
let _ = m.remove_label_values(&[
KIND,
&tenant_shard_id.tenant_id.to_string(),
&tenant_shard_id.shard_slug().to_string(),
]);
}
}

impl Metric for TimelineGet {
#[inline(always)]
fn accounting_start(&self) {
self.count_accounted_start.inc();
}
#[inline(always)]
fn accounting_finish(&self) {
self.count_accounted_finish.inc();
}
#[inline(always)]
fn observe_throttling(
&self,
@@ -3097,7 +3284,7 @@ pub(crate) mod tenant_throttling {
) {
let val = u64::try_from(wait_time.as_micros()).unwrap();
self.wait_time.inc_by(val);
self.count.inc();
self.count_throttled.inc();
}
}
}
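The module above pairs every logical counter with two Prometheus series, one global and one labelled by tenant, and bumps both on each increment. A minimal sketch of that pattern using the prometheus crate directly (metric names, labels, and the tenant id are illustrative, not the pageserver's):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

static COUNT_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!("sketch_throttling_count_global", "illustrative", &["kind"]).unwrap()
});
static COUNT_PER_TENANT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!("sketch_throttling_count", "illustrative", &["kind", "tenant_id"])
        .unwrap()
});

/// One logical counter backed by a global child and a per-tenant child.
struct GlobalAndPerTenant {
    global: IntCounter,
    per_tenant: IntCounter,
}

impl GlobalAndPerTenant {
    fn new(kind: &str, tenant_id: &str) -> Self {
        Self {
            global: COUNT_GLOBAL.with_label_values(&[kind]),
            per_tenant: COUNT_PER_TENANT.with_label_values(&[kind, tenant_id]),
        }
    }
    fn inc_by(&self, n: u64) {
        // Both series move together, so dashboards can aggregate either way.
        self.global.inc_by(n);
        self.per_tenant.inc_by(n);
    }
}

fn main() {
    let c = GlobalAndPerTenant::new("timeline_get", "tenant-a");
    c.inc_by(3);
    // Dropping the per-tenant series on detach mirrors remove_tenant_metrics above.
    let _ = COUNT_PER_TENANT.remove_label_values(&["timeline_get", "tenant-a"]);
}
```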
@@ -3227,11 +3414,14 @@ pub fn preinitialize_metrics() {
}

// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
[
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
&SMGR_QUERY_STARTED_GLOBAL,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});

// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
@@ -3253,7 +3443,8 @@ pub fn preinitialize_metrics() {

// Custom
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
Lazy::force(&BASEBACKUP_QUERY_TIME);
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);

tenant_throttling::preinitialize_global_metrics();
}

@@ -840,6 +840,36 @@ impl Timeline {
Ok(total_size * BLCKSZ as u64)
}

/// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
/// for gc-compaction.
///
/// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
/// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
/// be kept only for a specific range of LSN.
///
/// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
/// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
/// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
/// determine which keys to retain/drop for gc-compaction.
///
/// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
/// to be retained for each of the branch LSN.
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_gc_compaction_keyspace(
&self,
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
let metadata_key_begin = Key::metadata_key_range().start;
let aux_v1_key = AUX_FILES_KEY;
let dense_keyspace = KeySpace {
ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
};
Ok((
dense_keyspace,
SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
))
}

///
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from

@@ -18,7 +18,6 @@ use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
@@ -34,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
use remote_storage::TimeoutOrCancel;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
@@ -302,7 +302,7 @@ pub struct Tenant {
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,

/// An ongoing timeline detach concurrency limiter.
///
@@ -563,6 +563,8 @@ pub enum CreateTimelineError {
AncestorLsn(anyhow::Error),
#[error("ancestor timeline is not active")]
AncestorNotActive,
#[error("ancestor timeline is archived")]
AncestorArchived,
#[error("tenant shutting down")]
ShuttingDown,
#[error(transparent)]
@@ -1031,13 +1033,9 @@ impl Tenant {
}

Ok(TenantPreload {
timelines: Self::load_timeline_metadata(
self,
remote_timeline_ids,
remote_storage,
cancel,
)
.await?,
timelines: self
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.await?,
})
}

@@ -1303,7 +1301,7 @@ impl Tenant {
.await
}

async fn load_timeline_metadata(
async fn load_timelines_metadata(
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
@@ -1311,33 +1309,10 @@ impl Tenant {
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
let cancel_clone = cancel.clone();
part_downloads.spawn(
async move {
debug!("starting index part download");

let index_part = client.download_index_file(&cancel_clone).await;

debug!("finished index part download");

Result::<_, anyhow::Error>::Ok(TimelinePreload {
client,
timeline_id,
index_part,
})
}
.map(move |res| {
res.with_context(|| format!("download index part for timeline {timeline_id}"))
})
.instrument(info_span!("download_index_part", %timeline_id)),
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
);
}

@@ -1348,8 +1323,7 @@ impl Tenant {
next = part_downloads.join_next() => {
match next {
Some(result) => {
let preload_result = result.context("join preload task")?;
let preload = preload_result?;
let preload = result.context("join preload task")?;
timeline_preloads.insert(preload.timeline_id, preload);
},
None => {
@@ -1366,6 +1340,36 @@ impl Tenant {
Ok(timeline_preloads)
}

fn load_timeline_metadata(
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("starting index part download");

let index_part = client.download_index_file(&cancel).await;

debug!("finished index part download");

TimelinePreload {
client,
timeline_id,
index_part,
}
}
}

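The refactor above factors the per-timeline download future out into `load_timeline_metadata` and drives all of them concurrently through a `tokio::task::JoinSet`. Here is a small, generic sketch of that spawn-then-`join_next` pattern; the `fetch` function is a placeholder standing in for the index-part download.

```rust
use std::collections::HashMap;

use tokio::task::JoinSet;

// Placeholder for the per-item work (stands in for download_index_file above).
async fn fetch(id: u32) -> (u32, String) {
    (id, format!("payload-{id}"))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut tasks = JoinSet::new();
    for id in 0..8u32 {
        // Build the future up front (like load_timeline_metadata), then spawn it.
        tasks.spawn(fetch(id));
    }

    let mut results = HashMap::new();
    while let Some(joined) = tasks.join_next().await {
        // join_next surfaces panics/cancellation as a JoinError, which the code
        // above wraps with the "join preload task" context.
        let (id, payload) = joined?;
        results.insert(id, payload);
    }
    assert_eq!(results.len(), 8);
    Ok(())
}
```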
pub(crate) async fn apply_timeline_archival_config(
&self,
timeline_id: TimelineId,
@@ -1696,6 +1700,11 @@ impl Tenant {
return Err(CreateTimelineError::AncestorNotActive);
}

if ancestor_timeline.is_archived() == Some(true) {
info!("tried to branch archived timeline");
return Err(CreateTimelineError::AncestorArchived);
}

if let Some(lsn) = ancestor_start_lsn.as_mut() {
*lsn = lsn.align();

@@ -1966,9 +1975,6 @@ impl Tenant {
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
}
TenantState::Loading => {
*current_state = TenantState::Activating(ActivatingFrom::Loading);
}
TenantState::Attaching => {
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
}
@@ -2149,7 +2155,7 @@ impl Tenant {
async fn set_stopping(
&self,
progress: completion::Barrier,
allow_transition_from_loading: bool,
_allow_transition_from_loading: bool,
allow_transition_from_attaching: bool,
) -> Result<(), SetStoppingError> {
let mut rx = self.state.subscribe();
@@ -2164,7 +2170,6 @@ impl Tenant {
);
false
}
TenantState::Loading => allow_transition_from_loading,
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
})
.await
@@ -2183,13 +2188,6 @@ impl Tenant {
*current_state = TenantState::Stopping { progress };
true
}
TenantState::Loading => {
if !allow_transition_from_loading {
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
};
*current_state = TenantState::Stopping { progress };
true
}
TenantState::Active => {
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
// are created after the transition to Stopping. That's harmless, as the Timelines
@@ -2245,7 +2243,7 @@ impl Tenant {
// The load & attach routines own the tenant state until it has reached `Active`.
// So, wait until it's done.
rx.wait_for(|state| match state {
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
TenantState::Activating(_) | TenantState::Attaching => {
info!(
"waiting for {} to turn Active|Broken|Stopping",
<&'static str>::from(state)
@@ -2265,7 +2263,7 @@ impl Tenant {
let reason = reason.to_string();
self.state.send_modify(|current_state| {
match *current_state {
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
TenantState::Activating(_) | TenantState::Attaching => {
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Active => {
@@ -2309,7 +2307,7 @@ impl Tenant {
loop {
let current_state = receiver.borrow_and_update().clone();
match current_state {
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
TenantState::Attaching | TenantState::Activating(_) => {
// in these states, there's a chance that we can reach ::Active
self.activate_now();
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
@@ -2831,7 +2829,7 @@ impl Tenant {
gate: Gate::default(),
timeline_get_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
&crate::metrics::tenant_throttling::TIMELINE_GET,
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
@@ -3625,7 +3623,7 @@ impl Tenant {
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<UninitializedTimeline> {
) -> anyhow::Result<UninitializedTimeline<'a>> {
let tenant_shard_id = self.tenant_shard_id;

let resources = self.build_timeline_resources(new_timeline_id);
@@ -4142,7 +4140,7 @@ pub(crate) mod harness {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

let tenant = Arc::new(Tenant::new(
TenantState::Loading,
TenantState::Attaching,
self.conf,
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt::from(self.tenant_conf.clone()),

@@ -5,6 +5,7 @@ use itertools::Itertools;
use super::storage_layer::LayerName;

/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
///
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
///
/// ```plain

@@ -1,13 +1,13 @@
//! Common traits and structs for layers

pub mod delta_layer;
pub mod filter_iterator;
pub mod image_layer;
pub mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
pub mod merge_iterator;

pub mod split_writer;

use crate::context::{AccessStatsBehavior, RequestContext};

@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadCoalesceMode, VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
@@ -1021,13 +1021,30 @@ impl DeltaLayerInner {
continue;
}
};

let view = BufView::new_slice(&blobs_buf.buf);
for meta in blobs_buf.blobs.iter().rev() {
if Some(meta.meta.key) == ignore_key_with_err {
continue;
}
let blob_read = meta.read(&view).await;
let blob_read = match blob_read {
Ok(buf) => buf,
Err(e) => {
reconstruct_state.on_key_error(
meta.meta.key,
PageReconstructError::Other(anyhow!(e).context(format!(
"Failed to decompress blob from virtual file {}",
self.file.path,
))),
);

ignore_key_with_err = Some(meta.meta.key);
continue;
}
};

let value = Value::des(&blob_read);

let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
let value = match value {
Ok(v) => v,
Err(e) => {
@@ -1243,21 +1260,21 @@ impl DeltaLayerInner {
buf.reserve(read.size());
let res = reader.read_blobs(&read, buf, ctx).await?;

let view = BufView::new_slice(&res.buf);

for blob in res.blobs {
let key = blob.meta.key;
let lsn = blob.meta.lsn;
let data = &res.buf[blob.start..blob.end];

let data = blob.read(&view).await?;

#[cfg(debug_assertions)]
Value::des(data)
Value::des(&data)
.with_context(|| {
format!(
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
blob.meta.key,
blob.meta.lsn,
blob.start,
blob.end,
utils::Hex(data)
"blob failed to deserialize for {}: {:?}",
blob,
utils::Hex(&data)
)
})
.unwrap();
@@ -1265,15 +1282,15 @@ impl DeltaLayerInner {
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = crate::repository::ValueBytes::will_init(data)
let will_init = crate::repository::ValueBytes::will_init(&data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);

per_blob_copy.clear();
per_blob_copy.extend_from_slice(data);
per_blob_copy.extend_from_slice(&data);

let (tmp, res) = writer
.put_value_bytes(
@@ -1538,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> {
.read_blobs(&plan, buf, self.ctx)
.await?;
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let value = Value::des(&frozen_buf[meta.start..meta.end])?;
let blob_read = meta.read(&view).await?;
let value = Value::des(&blob_read)?;

next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
}
self.key_values_batch = next_batch;
@@ -1916,9 +1936,13 @@ pub(crate) mod test {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
.await?;
let view = BufView::new_slice(&blobs_buf.buf);
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
let value = meta.read(&view).await?;
assert_eq!(
&value[..],
&entries_meta.index[&(meta.meta.key, meta.meta.lsn)]
);
}

buf = Some(blobs_buf.buf);

pageserver/src/tenant/storage_layer/filter_iterator.rs (new file, 205 lines)
@@ -0,0 +1,205 @@
use std::ops::Range;

use anyhow::bail;
use pageserver_api::{
key::Key,
keyspace::{KeySpace, SparseKeySpace},
};
use utils::lsn::Lsn;

use crate::repository::Value;

use super::merge_iterator::MergeIterator;

/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
///
/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
/// to be retained.
pub struct FilterIterator<'a> {
inner: MergeIterator<'a>,
retain_key_filters: Vec<Range<Key>>,
current_filter_idx: usize,
}

impl<'a> FilterIterator<'a> {
pub fn create(
inner: MergeIterator<'a>,
dense_keyspace: KeySpace,
sparse_keyspace: SparseKeySpace,
) -> anyhow::Result<Self> {
let mut retain_key_filters = Vec::new();
retain_key_filters.extend(dense_keyspace.ranges);
retain_key_filters.extend(sparse_keyspace.0.ranges);
retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start));
// Verify key filters are non-overlapping and sorted
for window in retain_key_filters.windows(2) {
if window[0].end > window[1].start {
bail!(
"Key filters are overlapping: {:?} and {:?}",
window[0],
window[1]
);
}
}
Ok(Self {
inner,
retain_key_filters,
current_filter_idx: 0,
})
}

pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
while let Some(item) = self.inner.next().await? {
while self.current_filter_idx < self.retain_key_filters.len()
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
{
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
self.current_filter_idx += 1;
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
}
if self.current_filter_idx >= self.retain_key_filters.len() {
// We already exhausted all filters, so we should return now
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter (nothing)
return Ok(None);
}
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
return Ok(Some(item));
}
// If the key is not contained in the key retaining filters, continue to the next item.
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
}
Ok(None)
}
}

#[cfg(test)]
mod tests {
use super::*;

use itertools::Itertools;
use pageserver_api::key::Key;
use utils::lsn::Lsn;

use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::produce_delta_layer,
},
DEFAULT_PG_VERSION,
};

async fn assert_filter_iter_equal(
filter_iter: &mut FilterIterator<'_>,
expect: &[(Key, Lsn, Value)],
) {
let mut expect_iter = expect.iter();
loop {
let o1 = filter_iter.next().await.unwrap();
let o2 = expect_iter.next();
assert_eq!(o1.is_some(), o2.is_some());
if o1.is_none() && o2.is_none() {
break;
}
let (k1, l1, v1) = o1.unwrap();
let (k2, l2, v2) = o2.unwrap();
assert_eq!(&k1, k2);
assert_eq!(l1, *l2);
assert_eq!(&v1, v2);
}
}

#[tokio::test]
async fn filter_keyspace_iterator() {
use crate::repository::Value;
use bytes::Bytes;

let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;

let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();

fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
const N: usize = 100;
let test_deltas1 = (0..N)
.map(|idx| {
(
get_key(idx as u32),
Lsn(0x20 * ((idx as u64) % 10 + 1)),
Value::Image(Bytes::from(format!("img{idx:05}"))),
)
})
.collect_vec();
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();

let merge_iter = MergeIterator::create(
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
&[],
&ctx,
);

let mut filter_iter = FilterIterator::create(
merge_iter,
KeySpace {
ranges: vec![
get_key(5)..get_key(10),
get_key(20)..get_key(30),
get_key(90)..get_key(110),
get_key(1000)..get_key(2000),
],
},
SparseKeySpace(KeySpace::default()),
)
.unwrap();
let mut result = Vec::new();
result.extend(test_deltas1[5..10].iter().cloned());
result.extend(test_deltas1[20..30].iter().cloned());
result.extend(test_deltas1[90..100].iter().cloned());
assert_filter_iter_equal(&mut filter_iter, &result).await;

let merge_iter = MergeIterator::create(
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
&[],
&ctx,
);

let mut filter_iter = FilterIterator::create(
merge_iter,
KeySpace {
ranges: vec![
get_key(0)..get_key(10),
get_key(20)..get_key(30),
get_key(90)..get_key(95),
],
},
SparseKeySpace(KeySpace::default()),
)
.unwrap();
let mut result = Vec::new();
result.extend(test_deltas1[0..10].iter().cloned());
result.extend(test_deltas1[20..30].iter().cloned());
result.extend(test_deltas1[90..95].iter().cloned());
assert_filter_iter_equal(&mut filter_iter, &result).await;
}
}
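The core of `FilterIterator` is a two-pointer scan: the key stream is ascending and the retained ranges are sorted and non-overlapping, so the filter index only ever moves forward. A standalone sketch of the same logic over plain integers, with no pageserver types:

```rust
use std::ops::Range;

/// Keeps only items whose key falls into one of the sorted, non-overlapping
/// `retain` ranges, assuming `items` is sorted by key ascending — the same
/// invariants FilterIterator relies on.
fn filter_sorted(items: &[(u32, &str)], retain: &[Range<u32>]) -> Vec<(u32, String)> {
    let mut out = Vec::new();
    let mut filter_idx = 0;
    for &(key, val) in items {
        // Advance past filters that end at or before the current key.
        while filter_idx < retain.len() && key >= retain[filter_idx].end {
            filter_idx += 1;
        }
        if filter_idx >= retain.len() {
            break; // all filters exhausted; nothing later can match
        }
        if retain[filter_idx].contains(&key) {
            out.push((key, val.to_string()));
        }
        // Otherwise the key sits in the gap before the current filter; skip it.
    }
    out
}

fn main() {
    let items: Vec<(u32, &str)> = (0..100).map(|k| (k, "v")).collect();
    let kept = filter_sorted(&items, &[5..10, 20..30, 90..110]);
    assert_eq!(kept.len(), 5 + 10 + 10);
}
```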
@@ -36,7 +36,8 @@ use crate::tenant::disk_btree::{
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
@@ -547,15 +548,15 @@ impl ImageLayerInner {

let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;

let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);

for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
let img_buf = meta.read(&view).await?;

key_count += 1;
writer
.put_image(meta.meta.key, img_buf, ctx)
.put_image(meta.meta.key, img_buf.into_bytes(), ctx)
.await
.context(format!("Storing key {}", meta.meta.key))?;
}
@@ -602,13 +603,28 @@ impl ImageLayerInner {
match res {
Ok(blobs_buf) => {
let frozen_buf = blobs_buf.buf.freeze();

let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
let img_buf = meta.read(&view).await;

let img_buf = match img_buf {
Ok(img_buf) => img_buf,
Err(e) => {
reconstruct_state.on_key_error(
meta.meta.key,
PageReconstructError::Other(anyhow!(e).context(format!(
"Failed to decompress blob from virtual file {}",
self.file.path,
))),
);

continue;
}
};
reconstruct_state.update_key(
&meta.meta.key,
self.lsn,
Value::Image(img_buf),
Value::Image(img_buf.into_bytes()),
);
}
}
@@ -1025,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> {
let blobs_buf = vectored_blob_reader
.read_blobs(&plan, buf, self.ctx)
.await?;
let frozen_buf: Bytes = blobs_buf.buf.freeze();
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = frozen_buf.slice(meta.start..meta.end);
next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
let img_buf = meta.read(&view).await?;
next_batch.push_back((
meta.meta.key,
self.image_layer.lsn,
Value::Image(img_buf.into_bytes()),
));
}
self.key_values_batch = next_batch;
Ok(())

@@ -439,11 +439,30 @@ impl Layer {

fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
// Visibility was modified to Visible: maybe log about this
match ctx.task_kind() {
TaskKind::CalculateSyntheticSize
| TaskKind::GarbageCollector
| TaskKind::MgmtRequest => {
// This situation is expected in code paths do binary searches of the LSN space to resolve
// an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
// and on-demand for certain HTTP API requests.
}
_ => {
// In all other contexts, it is unusual to do I/O involving layers which are not visible at
// some branch tip, so we log the fact that we are accessing something that the visibility
// calculation thought should not be visible.
//
// This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
// which was covered by a concurrent compaction.
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
}
}

// Update the timeline's visible bytes count
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge

@@ -1025,6 +1025,15 @@ fn access_stats() {
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));

// Recording access implicitly makes layer visible, if it wasn't already
let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
access_stats.set_visibility(LayerVisibilityHint::Covered);
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
assert!(access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert!(!access_stats.record_access_at(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
}

#[test]

@@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// How many errors we have seen consequtively
let mut error_run_count = 0;

let mut last_throttle_flag_reset_at = Instant::now();

TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}

let sleep_duration;
if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
@@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
};

// Run compaction
let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
let IterationResult { output, elapsed } = iteration
.run(tenant.compaction_iteration(&cancel, &ctx))
.await;
match output {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if has_pending_task { Duration::ZERO } else { period };
sleep_duration = if has_pending_task {
Duration::ZERO
} else {
period
};
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}

// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
tracing::debug!(
elapsed_ms = elapsed.as_millis(),
"compaction iteration complete"
);
};

// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
if let Some(walredo_mgr) = &tenant.walredo_mgr {
walredo_mgr.maybe_quiesce(period * 10);
}

// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted,
count_throttled,
sum_throttled_usecs,
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});

// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
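The loop above sleeps for `period` unless the last iteration reported pending work, and the sleep races against cancellation. A minimal sketch of that scheduling shape with tokio and tokio-util; `run_iteration` is a placeholder, not the compaction code.

```rust
use std::time::Duration;

use tokio_util::sync::CancellationToken;

// Placeholder for one compaction pass: returns true if more work is pending.
async fn run_iteration(i: u32) -> bool {
    i < 3
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let period = Duration::from_millis(50);

    let mut i = 0;
    loop {
        let has_pending_task = run_iteration(i).await;
        i += 1;

        // Re-run immediately while there is pending work, otherwise wait a period.
        let sleep_duration = if has_pending_task { Duration::ZERO } else { period };

        // Sleep, but wake up early (and exit) if cancellation is requested.
        if tokio::time::timeout(sleep_duration, cancel.cancelled()).await.is_ok() {
            break; // cancelled
        }
        if i > 5 {
            cancel.cancel(); // stop the demo loop
        }
    }
}
```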
@@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
tokio::select! {
_ = cancel.cancelled() => {
@@ -483,6 +468,28 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
kind: BackgroundLoopKind::IngestHouseKeeping,
};
iteration.run(tenant.ingest_housekeeping()).await;

// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
// Or just spawn another background loop for this throttle, it's not like it's super costly.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
info!(
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
count_accounted = count_accounted_finish, // don't break existing log scraping
count_throttled,
sum_throttled_usecs,
count_accounted_start, // log after pre-existing fields to not break existing log scraping
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
});
}
}
.await;

@@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
pub struct Throttle<M: Metric> {
inner: ArcSwap<Inner>,
metric: M,
/// will be turned into [`Stats::count_accounted`]
count_accounted: AtomicU64,
/// will be turned into [`Stats::count_accounted_start`]
count_accounted_start: AtomicU64,
/// will be turned into [`Stats::count_accounted_finish`]
count_accounted_finish: AtomicU64,
/// will be turned into [`Stats::count_throttled`]
count_throttled: AtomicU64,
/// will be turned into [`Stats::sum_throttled_usecs`]
@@ -43,17 +45,21 @@ pub struct Observation {
pub wait_time: Duration,
}
pub trait Metric {
fn accounting_start(&self);
fn accounting_finish(&self);
fn observe_throttling(&self, observation: &Observation);
}

/// See [`Throttle::reset_stats`].
pub struct Stats {
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
pub count_accounted: u64,
// Subset of the `accounted` requests that were actually throttled.
// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
/// Number of requests that started [`Throttle::throttle`] calls.
pub count_accounted_start: u64,
/// Number of requests that finished [`Throttle::throttle`] calls.
pub count_accounted_finish: u64,
/// Subset of the `accounted` requests that were actually throttled.
/// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
pub count_throttled: u64,
// Sum of microseconds that throttled requests spent waiting for throttling.
/// Sum of microseconds that throttled requests spent waiting for throttling.
pub sum_throttled_usecs: u64,
}

@@ -65,7 +71,8 @@ where
Self {
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
metric,
count_accounted: AtomicU64::new(0),
count_accounted_start: AtomicU64::new(0),
count_accounted_finish: AtomicU64::new(0),
count_throttled: AtomicU64::new(0),
sum_throttled_usecs: AtomicU64::new(0),
}
@@ -117,11 +124,13 @@ where
/// This method allows retrieving & resetting that flag.
/// Useful for periodic reporting.
pub fn reset_stats(&self) -> Stats {
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
Stats {
count_accounted,
count_accounted_start,
count_accounted_finish,
count_throttled,
sum_throttled_usecs,
}
@@ -139,9 +148,12 @@ where
};
let start = std::time::Instant::now();

self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
self.metric.accounting_finish();

self.count_accounted.fetch_add(1, Ordering::Relaxed);
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
let now = Instant::now();

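The throttle now counts starts and finishes as two independent relaxed atomics and reports them by swapping each back to zero, which is why the Stats doc comment warns about a slight drift between the two. A standalone sketch of that pattern:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Default)]
struct ThrottleStats {
    count_accounted_start: AtomicU64,
    count_accounted_finish: AtomicU64,
}

impl ThrottleStats {
    // Called before the (possibly throttled) operation.
    fn start(&self) {
        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
    }
    // Called after it completes.
    fn finish(&self) {
        self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
    }

    // Read-and-reset for periodic reporting; the two values can drift slightly
    // because they are independent atomics.
    fn reset(&self) -> (u64, u64) {
        (
            self.count_accounted_start.swap(0, Ordering::Relaxed),
            self.count_accounted_finish.swap(0, Ordering::Relaxed),
        )
    }
}

fn main() {
    let stats = ThrottleStats::default();
    for _ in 0..10 {
        stats.start();
        stats.finish();
    }
    stats.start(); // one in-flight operation not yet finished
    let (started, finished) = stats.reset();
    assert_eq!((started, finished), (11, 10));
}
```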
@@ -112,7 +112,7 @@ use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE};
|
||||
use utils::{
|
||||
completion,
|
||||
generation::Generation,
|
||||
@@ -196,9 +196,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub timeline_get_throttle: Arc<
|
||||
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
|
||||
>,
|
||||
pub timeline_get_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -406,9 +405,8 @@ pub struct Timeline {
|
||||
gc_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
/// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
|
||||
timeline_get_throttle: Arc<
|
||||
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
|
||||
>,
|
||||
timeline_get_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
|
||||
/// Keep aux directory cache to avoid it's reconstruction on each update
|
||||
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
|
||||
@@ -1339,6 +1337,10 @@ impl Timeline {
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
let lease = {
|
||||
// Normalize the requested LSN to be aligned, and move to the first record
|
||||
// if it points to the beginning of the page (header).
|
||||
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
|
||||
|
||||
let mut gc_info = self.gc_info.write().unwrap();
|
||||
|
||||
let valid_until = SystemTime::now() + length;
|
||||
@@ -3599,7 +3601,7 @@ impl Timeline {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
@@ -3838,16 +3840,20 @@ impl Timeline {
|
||||
partition_size: u64,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
anyhow::bail!("repartition() called concurrently, this should not happen");
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called concurrently, this should not happen"
|
||||
)));
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
|
||||
if lsn < *partition_lsn {
|
||||
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
)));
|
||||
}
|
||||
|
||||
let distance = lsn.0 - partition_lsn.0;
|
||||
@@ -4316,7 +4322,9 @@ impl Timeline {
|
||||
timer.stop_and_record();
|
||||
|
||||
// Creating image layers may have caused some previously visible layers to be covered
|
||||
self.update_layer_visibility().await?;
|
||||
if !image_layers.is_empty() {
|
||||
self.update_layer_visibility().await?;
|
||||
}
|
||||
|
||||
Ok(image_layers)
|
||||
}
|
||||
@@ -4447,6 +4455,12 @@ pub(crate) enum CompactionError {
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl CompactionError {
|
||||
pub fn is_cancelled(&self) -> bool {
|
||||
matches!(self, CompactionError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CollectKeySpaceError> for CompactionError {
|
||||
fn from(err: CollectKeySpaceError) -> Self {
|
||||
match err {
|
||||
|
||||
@@ -31,6 +31,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
|
||||
use crate::page_cache;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||
@@ -389,7 +390,7 @@ impl Timeline {
|
||||
// error but continue.
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() {
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
(1, false)
|
||||
@@ -1772,6 +1773,7 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
lowest_retain_lsn
|
||||
);
|
||||
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
@@ -1820,7 +1822,12 @@ impl Timeline {
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
|
||||
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
|
||||
let mut merge_iter = FilterIterator::create(
|
||||
MergeIterator::create(&delta_layers, &image_layers, ctx),
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
|
||||
@@ -30,8 +30,8 @@ use crate::{
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
|
||||
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -557,6 +557,8 @@ impl Timeline {
|
||||
gather_result = gather => {
|
||||
match gather_result {
|
||||
Ok(_) => {},
|
||||
// It can happen sometimes that we hit this instead of the cancellation token firing above
|
||||
Err(CalculateSyntheticSizeError::Cancelled) => {}
|
||||
Err(e) => {
|
||||
// We don't care about the result, but, if it failed, we should log it,
|
||||
// since consumption metric might be hitting the cached value and
|
||||
|
||||
@@ -16,8 +16,9 @@
|
||||
//! Note that the vectored blob api does *not* go through the page cache.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::Key;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
@@ -35,11 +36,123 @@ pub struct BlobMeta {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]
|
||||
/// A view into the vectored blobs read buffer.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum BufView<'a> {
|
||||
Slice(&'a [u8]),
|
||||
Bytes(bytes::Bytes),
|
||||
}
|
||||
|
||||
impl<'a> BufView<'a> {
|
||||
/// Creates a new slice-based view on the blob.
|
||||
pub fn new_slice(slice: &'a [u8]) -> Self {
|
||||
Self::Slice(slice)
|
||||
}
|
||||
|
||||
/// Creates a new [`bytes::Bytes`]-based view on the blob.
|
||||
pub fn new_bytes(bytes: bytes::Bytes) -> Self {
|
||||
Self::Bytes(bytes)
|
||||
}
|
||||
|
||||
/// Convert the view into `Bytes`.
|
||||
///
|
||||
/// If using a slice as the underlying storage, the copy will be an O(n) operation.
|
||||
pub fn into_bytes(self) -> Bytes {
|
||||
match self {
|
||||
BufView::Slice(slice) => Bytes::copy_from_slice(slice),
|
||||
BufView::Bytes(bytes) => bytes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a sub-view of the blob based on the range.
|
||||
fn view(&self, range: std::ops::Range<usize>) -> Self {
|
||||
match self {
|
||||
BufView::Slice(slice) => BufView::Slice(&slice[range]),
|
||||
BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for BufView<'a> {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BufView::Slice(slice) => slice,
|
||||
BufView::Bytes(bytes) => bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> AsRef<[u8]> for BufView<'a> {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
BufView::Slice(slice) => slice,
|
||||
BufView::Bytes(bytes) => bytes.as_ref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for BufView<'a> {
|
||||
fn from(value: &'a [u8]) -> Self {
|
||||
Self::new_slice(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Bytes> for BufView<'_> {
|
||||
fn from(value: Bytes) -> Self {
|
||||
Self::new_bytes(value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte range is potentially compressed,
|
||||
/// subject to [`VectoredBlob::compression_bits`].
|
||||
pub struct VectoredBlob {
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
/// Blob metadata.
|
||||
pub meta: BlobMeta,
|
||||
/// Start offset.
|
||||
start: usize,
|
||||
/// End offset.
|
||||
end: usize,
|
||||
/// Compression used on the blob.
|
||||
compression_bits: u8,
|
||||
}
|
||||
|
||||
impl VectoredBlob {
|
||||
/// Reads a decompressed view of the blob.
|
||||
pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
|
||||
let view = buf.view(self.start..self.end);
|
||||
|
||||
match self.compression_bits {
|
||||
BYTE_UNCOMPRESSED => Ok(view),
|
||||
BYTE_ZSTD => {
|
||||
let mut decompressed_vec = Vec::new();
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder.write_all(&view).await?;
|
||||
decoder.flush().await?;
|
||||
// Zero-copy conversion from `Vec` to `Bytes`
|
||||
Ok(BufView::new_bytes(Bytes::from(decompressed_vec)))
|
||||
}
|
||||
bits => {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end),
|
||||
);
|
||||
Err(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
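Putting BufView and VectoredBlob::read together, a caller of read_blobs now materializes blob contents roughly as follows (a sketch mirroring the test further down; process is a placeholder for whatever the caller does with the bytes):

    let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
    // One view over the whole read buffer; individual blobs are sub-views of it.
    let view = BufView::new_slice(&blobs_buf.buf);
    for blob in &blobs_buf.blobs {
        // Decompresses zstd-compressed blobs into fresh Bytes and returns a
        // zero-copy sub-view for uncompressed ones.
        let data = blob.read(&view).await?;
        process(blob.meta.key, blob.meta.lsn, &data[..]);
    }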
impl std::fmt::Display for VectoredBlob {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}@{}, {}..{}",
|
||||
self.meta.key, self.meta.lsn, self.start, self.end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Return type of [`VectoredBlobReader::read_blobs`]
|
||||
@@ -514,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
);
|
||||
}
|
||||
|
||||
let mut buf = self
|
||||
let buf = self
|
||||
.file
|
||||
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
|
||||
.await?
|
||||
@@ -529,9 +642,6 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
// of a blob is implicit: the start of the next blob if one exists
|
||||
// or the end of the read.
|
||||
|
||||
// Some scratch space, put here for reusing the allocation
|
||||
let mut decompressed_vec = Vec::new();
|
||||
|
||||
for (blob_start, meta) in blobs_at {
|
||||
let blob_start_in_buf = blob_start - start_offset;
|
||||
let first_len_byte = buf[blob_start_in_buf as usize];
|
||||
@@ -557,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
)
|
||||
};
|
||||
|
||||
let start_raw = blob_start_in_buf + size_length;
|
||||
let end_raw = start_raw + blob_size;
|
||||
let (start, end);
|
||||
if compression_bits == BYTE_UNCOMPRESSED {
|
||||
start = start_raw as usize;
|
||||
end = end_raw as usize;
|
||||
} else if compression_bits == BYTE_ZSTD {
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder
|
||||
.write_all(&buf[start_raw as usize..end_raw as usize])
|
||||
.await?;
|
||||
decoder.flush().await?;
|
||||
start = buf.len();
|
||||
buf.extend_from_slice(&decompressed_vec);
|
||||
end = buf.len();
|
||||
decompressed_vec.clear();
|
||||
} else {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("invalid compression byte {compression_bits:x}"),
|
||||
);
|
||||
return Err(error);
|
||||
}
|
||||
let start = (blob_start_in_buf + size_length) as usize;
|
||||
let end = start + blob_size as usize;
|
||||
|
||||
metas.push(VectoredBlob {
|
||||
start,
|
||||
end,
|
||||
meta: *meta,
|
||||
compression_bits,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1020,8 +1109,13 @@ mod tests {
|
||||
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
|
||||
assert_eq!(result.blobs.len(), 1);
|
||||
let read_blob = &result.blobs[0];
|
||||
let read_buf = &result.buf[read_blob.start..read_blob.end];
|
||||
assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
|
||||
let view = BufView::new_slice(&result.buf);
|
||||
let read_buf = read_blob.read(&view).await?;
|
||||
assert_eq!(
|
||||
&blob[..],
|
||||
&read_buf[..],
|
||||
"mismatch for idx={idx} at offset={offset}"
|
||||
);
|
||||
buf = result.buf;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -205,6 +205,22 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a ping request-response roundtrip.
|
||||
///
|
||||
/// Not used in production, but used by Rust benchmarks.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
|
||||
self.do_with_walredo_process(pg_version, |proc| async move {
|
||||
proc.ping(Duration::from_secs(1))
|
||||
.await
|
||||
.map_err(Error::Other)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn status(&self) -> WalRedoManagerStatus {
|
||||
WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
@@ -297,6 +313,9 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancel-safe iff `closure` is cancel-safe.
|
||||
async fn do_with_walredo_process<
|
||||
F: FnOnce(Arc<Process>) -> Fut,
|
||||
Fut: Future<Output = Result<O, Error>>,
|
||||
@@ -537,6 +556,17 @@ mod tests {
|
||||
use tracing::Instrument;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ping() {
|
||||
let h = RedoHarness::new().unwrap();
|
||||
|
||||
h.manager
|
||||
.ping(14)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
.expect("ping should work");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn short_v14_redo() {
|
||||
let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();
|
||||
|
||||
@@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
page_cache::PAGE_SZ,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
@@ -237,6 +238,26 @@ impl WalRedoProcess {
|
||||
res
|
||||
}
|
||||
|
||||
/// Do a ping request-response roundtrip.
|
||||
///
|
||||
/// Not used in production, but used by Rust benchmarks.
|
||||
pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity(4);
|
||||
protocol::build_ping_msg(&mut writebuf);
|
||||
let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo ping timed out");
|
||||
};
|
||||
let response = res?;
|
||||
if response.len() != PAGE_SZ {
|
||||
anyhow::bail!(
|
||||
"WAL redo ping response should respond with page-sized response: {}",
|
||||
response.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
|
||||
@@ -55,3 +55,8 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
}
|
||||
|
||||
pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
|
||||
buf.put_u8(b'H');
|
||||
buf.put_u32(4);
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@ OBJS = \
|
||||
hll.o \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_pgversioncompat.o \
|
||||
neon_perf_counters.o \
|
||||
neon_utils.o \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
@@ -23,7 +25,18 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
|
||||
DATA = \
|
||||
neon--1.0.sql \
|
||||
neon--1.0--1.1.sql \
|
||||
neon--1.1--1.2.sql \
|
||||
neon--1.2--1.3.sql \
|
||||
neon--1.3--1.4.sql \
|
||||
neon--1.4--1.5.sql \
|
||||
neon--1.5--1.4.sql \
|
||||
neon--1.4--1.3.sql \
|
||||
neon--1.3--1.2.sql \
|
||||
neon--1.2--1.1.sql \
|
||||
neon--1.1--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -109,6 +109,7 @@ typedef struct FileCacheControl
|
||||
* reenabling */
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
uint32 used_pages; /* number of used pages */
|
||||
uint32 limit; /* shared copy of lfc_size_limit */
|
||||
uint64 hits;
|
||||
uint64 misses;
|
||||
@@ -905,6 +906,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
|
||||
{
|
||||
lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
|
||||
}
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
@@ -959,6 +964,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
for (int i = 0; i < blocks_in_chunk; i++)
|
||||
{
|
||||
lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1);
|
||||
entry->bitmap[(chunk_offs + i) >> 5] |=
|
||||
(1 << ((chunk_offs + i) & 31));
|
||||
}
|
||||
@@ -1051,6 +1057,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
case 5:
|
||||
key = "file_cache_used_pages";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->used_pages;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_utils.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "walproposer.h"
|
||||
@@ -331,6 +332,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
|
||||
}
|
||||
if (shard->conn)
|
||||
{
|
||||
MyNeonCounters->pageserver_disconnects_total++;
|
||||
PQfinish(shard->conn);
|
||||
shard->conn = NULL;
|
||||
}
|
||||
@@ -737,6 +739,8 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn;
|
||||
|
||||
MyNeonCounters->pageserver_requests_sent_total++;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
@@ -889,6 +893,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
}
|
||||
else
|
||||
{
|
||||
MyNeonCounters->pageserver_send_flushes_total++;
|
||||
if (PQflush(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
@@ -922,7 +927,7 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
static Size
|
||||
PagestoreShmemSize(void)
|
||||
{
|
||||
return sizeof(PagestoreShmemState);
|
||||
return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -941,6 +946,9 @@ PagestoreShmemInit(void)
|
||||
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
}
|
||||
|
||||
NeonPerfCountersShmemInit();
|
||||
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
return found;
|
||||
}
|
||||
|
||||
pgxn/neon/neon--1.4--1.5.sql (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
|
||||
|
||||
|
||||
CREATE FUNCTION get_backend_perf_counters()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION get_perf_counters()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
-- Show various metrics, for each backend. Note that the values are not reset
|
||||
-- when a backend exits. When a new backend starts with the same backend ID, it will
|
||||
-- continue accumulating the values from where the old backend left off. If you are
|
||||
-- only interested in the changes from your own session, store the values at the
|
||||
-- beginning of the session somewhere, and subtract them on subsequent calls.
|
||||
--
|
||||
-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
|
||||
CREATE VIEW neon_backend_perf_counters AS
|
||||
SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
|
||||
FROM get_backend_perf_counters() AS P (
|
||||
procno integer,
|
||||
pid integer,
|
||||
metric text,
|
||||
bucket_le float8,
|
||||
value float8
|
||||
);
|
||||
|
||||
-- Summary across all backends. (This could also be implemented with
|
||||
-- an aggregate query over the neon_backend_perf_counters view.)
|
||||
CREATE VIEW neon_perf_counters AS
|
||||
SELECT P.metric, P.bucket_le, P.value
|
||||
FROM get_perf_counters() AS P (
|
||||
metric text,
|
||||
bucket_le float8,
|
||||
value float8
|
||||
);
|
||||
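For illustration, a metrics exporter could scrape the aggregated view with any Postgres client. A sketch using tokio-postgres (the emit sink is a placeholder; the column names follow the view definition above):

    let rows = client
        .query("SELECT metric, bucket_le, value FROM neon_perf_counters", &[])
        .await?;
    for row in rows {
        let metric: String = row.get("metric");
        // bucket_le is NULL for plain counters, and the bucket upper bound in
        // seconds (+Inf for the last bucket) for histogram rows.
        let bucket_le: Option<f64> = row.get("bucket_le");
        let value: f64 = row.get("value");
        emit(&metric, bucket_le, value);
    }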
pgxn/neon/neon--1.5--1.4.sql (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
DROP VIEW IF EXISTS neon_perf_counters;
|
||||
DROP VIEW IF EXISTS neon_backend_perf_counters;
|
||||
DROP FUNCTION IF EXISTS get_perf_counters();
|
||||
DROP FUNCTION IF EXISTS get_backend_perf_counters();
|
||||
@@ -1,5 +1,7 @@
|
||||
# neon extension
|
||||
comment = 'cloud storage for PostgreSQL'
|
||||
# TODO: bump default version to 1.5, after we are certain that we don't
|
||||
# need to roll back the compute image
|
||||
default_version = '1.4'
|
||||
module_pathname = '$libdir/neon'
|
||||
relocatable = true
|
||||
|
||||
pgxn/neon/neon_perf_counters.c (new file, 261 lines)
@@ -0,0 +1,261 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neon_perf_counters.c
|
||||
* Collect statistics about Neon I/O
|
||||
*
|
||||
* Each backend has its own set of counters in shared memory.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "storage/proc.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "utils/builtins.h"
|
||||
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_pgversioncompat.h"
|
||||
|
||||
neon_per_backend_counters *neon_per_backend_counters_shared;
|
||||
|
||||
Size
|
||||
NeonPerfCountersShmemSize(void)
|
||||
{
|
||||
Size size = 0;
|
||||
|
||||
size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters)));
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
void
|
||||
NeonPerfCountersShmemInit(void)
|
||||
{
|
||||
bool found;
|
||||
|
||||
neon_per_backend_counters_shared =
|
||||
ShmemInitStruct("Neon perf counters",
|
||||
mul_size(MaxBackends,
|
||||
sizeof(neon_per_backend_counters)),
|
||||
&found);
|
||||
Assert(found == IsUnderPostmaster);
|
||||
if (!found)
|
||||
{
|
||||
/* shared memory is initialized to zeros, so nothing to do here */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Count a GetPage wait operation.
|
||||
*/
|
||||
void
|
||||
inc_getpage_wait(uint64 latency_us)
|
||||
{
|
||||
int lo = 0;
|
||||
int hi = NUM_GETPAGE_WAIT_BUCKETS - 1;
|
||||
|
||||
/* Find the right bucket with binary search */
|
||||
while (lo < hi)
|
||||
{
|
||||
int mid = (lo + hi) / 2;
|
||||
|
||||
if (latency_us < getpage_wait_bucket_thresholds[mid])
|
||||
hi = mid;
|
||||
else
|
||||
lo = mid + 1;
|
||||
}
|
||||
MyNeonCounters->getpage_wait_us_bucket[lo]++;
|
||||
MyNeonCounters->getpage_wait_us_sum += latency_us;
|
||||
MyNeonCounters->getpage_wait_us_count++;
|
||||
}
|
||||
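The same upper-bound search, transcribed into Rust purely for illustration (not part of the extension): it returns the first bucket whose threshold is strictly greater than the latency, so e.g. 250 us lands in the 300 us bucket and very large values fall into the UINT64_MAX bucket.

    fn bucket_index(thresholds: &[u64], latency_us: u64) -> usize {
        // Assumes a non-empty, ascending thresholds array ending in u64::MAX,
        // matching getpage_wait_bucket_thresholds above.
        let (mut lo, mut hi) = (0usize, thresholds.len() - 1);
        while lo < hi {
            let mid = (lo + hi) / 2;
            if latency_us < thresholds[mid] {
                hi = mid;
            } else {
                lo = mid + 1;
            }
        }
        lo
    }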
|
||||
/*
|
||||
* Support functions for the views, neon_backend_perf_counters and
|
||||
* neon_perf_counters.
|
||||
*/
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char *name;
|
||||
bool is_bucket;
|
||||
double bucket_le;
|
||||
double value;
|
||||
} metric_t;
|
||||
|
||||
static metric_t *
|
||||
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
|
||||
{
|
||||
#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8)
|
||||
metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
|
||||
uint64 bucket_accum;
|
||||
int i = 0;
|
||||
Datum getpage_wait_str;
|
||||
|
||||
metrics[i].name = "getpage_wait_seconds_count";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_wait_us_count;
|
||||
i++;
|
||||
metrics[i].name = "getpage_wait_seconds_sum";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0;
|
||||
i++;
|
||||
|
||||
bucket_accum = 0;
|
||||
for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
|
||||
{
|
||||
uint64 threshold = getpage_wait_bucket_thresholds[bucketno];
|
||||
|
||||
bucket_accum += counters->getpage_wait_us_bucket[bucketno];
|
||||
|
||||
metrics[i].name = "getpage_wait_seconds_bucket";
|
||||
metrics[i].is_bucket = true;
|
||||
metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
|
||||
metrics[i].value = (double) bucket_accum;
|
||||
i++;
|
||||
}
|
||||
metrics[i].name = "getpage_prefetch_requests_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_prefetch_requests_total;
|
||||
i++;
|
||||
metrics[i].name = "getpage_sync_requests_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_sync_requests_total;
|
||||
i++;
|
||||
metrics[i].name = "getpage_prefetch_misses_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_prefetch_misses_total;
|
||||
i++;
|
||||
metrics[i].name = "getpage_prefetch_discards_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_prefetch_discards_total;
|
||||
i++;
|
||||
metrics[i].name = "pageserver_requests_sent_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->pageserver_requests_sent_total;
|
||||
i++;
|
||||
metrics[i].name = "pageserver_requests_disconnects_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->pageserver_disconnects_total;
|
||||
i++;
|
||||
metrics[i].name = "pageserver_send_flushes_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->pageserver_send_flushes_total;
|
||||
i++;
|
||||
metrics[i].name = "file_cache_hits_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->file_cache_hits_total;
|
||||
i++;
|
||||
|
||||
Assert(i == NUM_METRICS);
|
||||
|
||||
/* NULL entry marks end of array */
|
||||
metrics[i].name = NULL;
|
||||
metrics[i].value = 0;
|
||||
|
||||
return metrics;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write metric to three output Datums
|
||||
*/
|
||||
static void
|
||||
metric_to_datums(metric_t *m, Datum *values, bool *nulls)
|
||||
{
|
||||
values[0] = CStringGetTextDatum(m->name);
|
||||
nulls[0] = false;
|
||||
if (m->is_bucket)
|
||||
{
|
||||
values[1] = Float8GetDatum(m->bucket_le);
|
||||
nulls[1] = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
values[1] = (Datum) 0;
|
||||
nulls[1] = true;
|
||||
}
|
||||
values[2] = Float8GetDatum(m->value);
|
||||
nulls[2] = false;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_backend_perf_counters);
|
||||
Datum
|
||||
neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
Datum values[5];
|
||||
bool nulls[5];
|
||||
|
||||
/* We put all the tuples into a tuplestore in one go. */
|
||||
InitMaterializedSRF(fcinfo, 0);
|
||||
|
||||
for (int procno = 0; procno < MaxBackends; procno++)
|
||||
{
|
||||
PGPROC *proc = GetPGProcByNumber(procno);
|
||||
int pid = proc->pid;
|
||||
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
|
||||
metric_t *metrics = neon_perf_counters_to_metrics(counters);
|
||||
|
||||
values[0] = Int32GetDatum(procno);
|
||||
nulls[0] = false;
|
||||
values[1] = Int32GetDatum(pid);
|
||||
nulls[1] = false;
|
||||
|
||||
for (int i = 0; metrics[i].name != NULL; i++)
|
||||
{
|
||||
metric_to_datums(&metrics[i], &values[2], &nulls[2]);
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
|
||||
pfree(metrics);
|
||||
}
|
||||
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_perf_counters);
|
||||
Datum
|
||||
neon_get_perf_counters(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
Datum values[3];
|
||||
bool nulls[3];
|
||||
Datum getpage_wait_str;
|
||||
neon_per_backend_counters totals = {0};
|
||||
metric_t *metrics;
|
||||
|
||||
/* We put all the tuples into a tuplestore in one go. */
|
||||
InitMaterializedSRF(fcinfo, 0);
|
||||
|
||||
/* Aggregate the counters across all backends */
|
||||
for (int procno = 0; procno < MaxBackends; procno++)
|
||||
{
|
||||
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
|
||||
|
||||
totals.getpage_wait_us_count += counters->getpage_wait_us_count;
|
||||
totals.getpage_wait_us_sum += counters->getpage_wait_us_sum;
|
||||
for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
|
||||
totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno];
|
||||
totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
|
||||
totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
|
||||
totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
|
||||
totals.getpage_prefetch_discards_total += counters->getpage_prefetch_discards_total;
|
||||
totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total;
|
||||
totals.pageserver_disconnects_total += counters->pageserver_disconnects_total;
|
||||
totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total;
|
||||
totals.file_cache_hits_total += counters->file_cache_hits_total;
|
||||
}
|
||||
|
||||
metrics = neon_perf_counters_to_metrics(&totals);
|
||||
for (int i = 0; metrics[i].name != NULL; i++)
|
||||
{
|
||||
metric_to_datums(&metrics[i], &values[0], &nulls[0]);
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
pfree(metrics);
|
||||
|
||||
return (Datum) 0;
|
||||
}
|
||||
pgxn/neon/neon_perf_counters.h (new file, 111 lines)
@@ -0,0 +1,111 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neon_perf_counters.h
|
||||
* Performance counters for neon storage requests
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef NEON_PERF_COUNTERS_H
|
||||
#define NEON_PERF_COUNTERS_H
|
||||
|
||||
#if PG_VERSION_NUM >= 170000
|
||||
#include "storage/procnumber.h"
|
||||
#else
|
||||
#include "storage/backendid.h"
|
||||
#include "storage/proc.h"
|
||||
#endif
|
||||
|
||||
static const uint64 getpage_wait_bucket_thresholds[] = {
|
||||
20, 30, 60, 100, /* 0 - 100 us */
|
||||
200, 300, 600, 1000, /* 100 us - 1 ms */
|
||||
2000, 3000, 6000, 10000, /* 1 ms - 10 ms */
|
||||
20000, 30000, 60000, 100000, /* 10 ms - 100 ms */
|
||||
200000, 300000, 600000, 1000000, /* 100 ms - 1 s */
|
||||
2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */
|
||||
20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */
|
||||
UINT64_MAX,
|
||||
};
|
||||
#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds))
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/*
|
||||
 * Histogram of how long an smgrread() request needs to wait for a response
 * from the pageserver. When prefetching is effective, these wait times can be
 * lower than the network latency to the pageserver, or even zero, if the
 * page has already been prefetched by the time we need to read it.
|
||||
*
|
||||
* Note: we accumulate these in microseconds, because that's convenient in
|
||||
* the backend, but the 'neon_backend_perf_counters' view will convert
|
||||
* them to seconds, to make them more idiomatic as prometheus metrics.
|
||||
*/
|
||||
uint64 getpage_wait_us_count;
|
||||
uint64 getpage_wait_us_sum;
|
||||
uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS];
|
||||
|
||||
/*
|
||||
* Total number of speculative prefetch Getpage requests and synchronous
|
||||
* GetPage requests sent.
|
||||
*/
|
||||
uint64 getpage_prefetch_requests_total;
|
||||
uint64 getpage_sync_requests_total;
|
||||
|
||||
/* XXX: It's not clear to me when these misses happen. */
|
||||
uint64 getpage_prefetch_misses_total;
|
||||
|
||||
/*
|
||||
 * Number of prefetched responses that were discarded because the
|
||||
* prefetched page was not needed or because it was concurrently fetched /
|
||||
* modified by another backend.
|
||||
*/
|
||||
uint64 getpage_prefetch_discards_total;
|
||||
|
||||
/*
|
||||
 * Total number of requests sent to the pageserver. (prefetch_requests_total
|
||||
 * and sync_requests_total count only GetPage requests; this counts all
|
||||
* request types.)
|
||||
*/
|
||||
uint64 pageserver_requests_sent_total;
|
||||
|
||||
/*
|
||||
* Number of times the connection to the pageserver was lost and the
|
||||
* backend had to reconnect. Note that this doesn't count the first
|
||||
* connection in each backend, only reconnects.
|
||||
*/
|
||||
uint64 pageserver_disconnects_total;
|
||||
|
||||
/*
|
||||
* Number of network flushes to the pageserver. Synchronous requests are
|
||||
* flushed immediately, but when prefetching requests are sent in batches,
|
||||
* this can be smaller than pageserver_requests_sent_total.
|
||||
*/
|
||||
uint64 pageserver_send_flushes_total;
|
||||
|
||||
/*
|
||||
* Number of requests satisfied from the LFC.
|
||||
*
|
||||
* This is redundant with the server-wide file_cache_hits, but this gives
|
||||
* per-backend granularity, and it's handy to have this in the same place
|
||||
* as counters for requests that went to the pageserver. Maybe move all
|
||||
* the LFC stats to this struct in the future?
|
||||
*/
|
||||
uint64 file_cache_hits_total;
|
||||
|
||||
} neon_per_backend_counters;
|
||||
|
||||
/* Pointer to the shared memory array of neon_per_backend_counters structs */
|
||||
extern neon_per_backend_counters *neon_per_backend_counters_shared;
|
||||
|
||||
#if PG_VERSION_NUM >= 170000
|
||||
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
|
||||
#else
|
||||
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProc->pgprocno])
|
||||
#endif
|
||||
|
||||
extern void inc_getpage_wait(uint64 latency);
|
||||
|
||||
extern Size NeonPerfCountersShmemSize(void);
|
||||
extern void NeonPerfCountersShmemInit(void);
|
||||
|
||||
|
||||
#endif /* NEON_PERF_COUNTERS_H */
|
||||
pgxn/neon/neon_pgversioncompat.c (new file, 44 lines)
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Support functions for the compatibility macros in neon_pgversioncompat.h
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/tuplestore.h"
|
||||
|
||||
#include "neon_pgversioncompat.h"
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 15
|
||||
void
|
||||
InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
|
||||
{
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
Tuplestorestate *tupstore;
|
||||
MemoryContext old_context,
|
||||
per_query_ctx;
|
||||
TupleDesc stored_tupdesc;
|
||||
|
||||
/* check to see if caller supports returning a tuplestore */
|
||||
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("set-valued function called in context that cannot accept a set")));
|
||||
|
||||
/*
|
||||
* Store the tuplestore and the tuple descriptor in ReturnSetInfo. This
|
||||
* must be done in the per-query memory context.
|
||||
*/
|
||||
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
|
||||
old_context = MemoryContextSwitchTo(per_query_ctx);
|
||||
|
||||
if (get_call_result_type(fcinfo, NULL, &stored_tupdesc) != TYPEFUNC_COMPOSITE)
|
||||
elog(ERROR, "return type must be a row type");
|
||||
|
||||
tupstore = tuplestore_begin_heap(false, false, work_mem);
|
||||
rsinfo->returnMode = SFRM_Materialize;
|
||||
rsinfo->setResult = tupstore;
|
||||
rsinfo->setDesc = stored_tupdesc;
|
||||
MemoryContextSwitchTo(old_context);
|
||||
}
|
||||
#endif
|
||||
@@ -6,6 +6,8 @@
|
||||
#ifndef NEON_PGVERSIONCOMPAT_H
|
||||
#define NEON_PGVERSIONCOMPAT_H
|
||||
|
||||
#include "fmgr.h"
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 17
|
||||
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
|
||||
#else
|
||||
@@ -123,4 +125,8 @@
|
||||
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
|
||||
#endif
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 15
|
||||
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
|
||||
#endif
|
||||
|
||||
#endif /* NEON_PGVERSIONCOMPAT_H */
|
||||
|
||||
@@ -66,6 +66,7 @@
|
||||
#include "storage/md.h"
|
||||
#include "storage/smgr.h"
|
||||
|
||||
#include "neon_perf_counters.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "bitmap.h"
|
||||
|
||||
@@ -289,7 +290,6 @@ static PrefetchState *MyPState;
|
||||
|
||||
static bool compact_prefetch_buffers(void);
|
||||
static void consume_prefetch_responses(void);
|
||||
static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
|
||||
static bool prefetch_read(PrefetchRequest *slot);
|
||||
static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
|
||||
static bool prefetch_wait_for(uint64 ring_index);
|
||||
@@ -780,21 +780,27 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
}
|
||||
|
||||
/*
|
||||
* prefetch_register_buffer() - register and prefetch buffer
|
||||
* prefetch_register_bufferv() - register and prefetch buffers
|
||||
*
|
||||
* Register that we may want the contents of BufferTag in the near future.
|
||||
* This is used when issuing a speculative prefetch request, but also when
|
||||
 * performing a synchronous request and we need the buffer right now.
|
||||
*
|
||||
* If force_request_lsns is not NULL, those values are sent to the
|
||||
 * pageserver. If NULL, we use the lastWrittenLsn infrastructure
|
||||
* to calculate the LSNs to send.
|
||||
*
|
||||
* When performing a prefetch rather than a synchronous request,
|
||||
* is_prefetch==true. Currently, it only affects how the request is accounted
|
||||
* in the perf counters.
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*/
|
||||
|
||||
static uint64
|
||||
prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
|
||||
BlockNumber nblocks, const bits8 *mask)
|
||||
BlockNumber nblocks, const bits8 *mask,
|
||||
bool is_prefetch)
|
||||
{
|
||||
uint64 min_ring_index;
|
||||
PrefetchRequest req;
|
||||
@@ -815,6 +821,7 @@ Retry:
|
||||
PrfHashEntry *entry = NULL;
|
||||
uint64 ring_index;
|
||||
neon_request_lsns *lsns;
|
||||
|
||||
if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
|
||||
continue;
|
||||
|
||||
@@ -858,6 +865,7 @@ Retry:
|
||||
prefetch_set_unused(ring_index);
|
||||
entry = NULL;
|
||||
slot = NULL;
|
||||
MyNeonCounters->getpage_prefetch_discards_total++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -972,6 +980,11 @@ Retry:
|
||||
|
||||
min_ring_index = Min(min_ring_index, ring_index);
|
||||
|
||||
if (is_prefetch)
|
||||
MyNeonCounters->getpage_prefetch_requests_total++;
|
||||
else
|
||||
MyNeonCounters->getpage_sync_requests_total++;
|
||||
|
||||
prefetch_do_request(slot, lsns);
|
||||
}
|
||||
|
||||
@@ -1000,13 +1013,6 @@ Retry:
|
||||
}
|
||||
|
||||
|
||||
static uint64
|
||||
prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
|
||||
{
|
||||
return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Note: this function can get canceled and use a long jump to the next catch
|
||||
* context. Take care.
|
||||
@@ -2612,7 +2618,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
lfc_present[i] = ~(lfc_present[i]);
|
||||
|
||||
ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
|
||||
lfc_present);
|
||||
lfc_present, true);
|
||||
nblocks -= iterblocks;
|
||||
blocknum += iterblocks;
|
||||
|
||||
@@ -2656,7 +2662,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
|
||||
|
||||
ring_index = prefetch_register_buffer(tag, NULL);
|
||||
ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
|
||||
|
||||
Assert(ring_index < MyPState->ring_unused &&
|
||||
MyPState->ring_last <= ring_index);
|
||||
@@ -2747,17 +2753,20 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
|
||||
* weren't for the behaviour of the LwLsn cache that uses the highest
|
||||
* value of the LwLsn cache when the entry is not found.
|
||||
*/
|
||||
prefetch_register_bufferv(buftag, request_lsns, nblocks, mask);
|
||||
prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false);
|
||||
|
||||
for (int i = 0; i < nblocks; i++)
|
||||
{
|
||||
void *buffer = buffers[i];
|
||||
BlockNumber blockno = base_blockno + i;
|
||||
neon_request_lsns *reqlsns = &request_lsns[i];
|
||||
TimestampTz start_ts, end_ts;
|
||||
|
||||
if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
|
||||
continue;
|
||||
|
||||
start_ts = GetCurrentTimestamp();
|
||||
|
||||
if (RecoveryInProgress() && MyBackendType != B_STARTUP)
|
||||
XLogWaitForReplayOf(reqlsns[0].request_lsn);
|
||||
|
||||
@@ -2794,6 +2803,7 @@ Retry:
|
||||
/* drop caches */
|
||||
prefetch_set_unused(slot->my_ring_index);
|
||||
pgBufferUsage.prefetch.expired += 1;
|
||||
MyNeonCounters->getpage_prefetch_discards_total++;
|
||||
/* make it look like a prefetch cache miss */
|
||||
entry = NULL;
|
||||
}
|
||||
@@ -2804,8 +2814,9 @@ Retry:
|
||||
if (entry == NULL)
|
||||
{
|
||||
pgBufferUsage.prefetch.misses += 1;
|
||||
MyNeonCounters->getpage_prefetch_misses_total++;
|
||||
|
||||
ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL);
|
||||
ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false);
|
||||
Assert(ring_index != UINT64_MAX);
|
||||
slot = GetPrfSlot(ring_index);
|
||||
}
|
||||
@@ -2860,6 +2871,9 @@ Retry:
|
||||
/* buffer was used, clean up for later reuse */
|
||||
prefetch_set_unused(ring_index);
|
||||
prefetch_cleanup_trailing_unused();
|
||||
|
||||
end_ts = GetCurrentTimestamp();
|
||||
inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2913,6 +2927,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
/* Try to read from local file cache */
|
||||
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
|
||||
{
|
||||
MyNeonCounters->file_cache_hits_total++;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3097,7 +3112,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
/* assume heap */
|
||||
RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
|
||||
RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
|
||||
|
||||
|
||||
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
|
||||
{
|
||||
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
* PushPage ('P'): Copy a page image (in the payload) to buffer cache
|
||||
* ApplyRecord ('A'): Apply a WAL record (in the payload)
|
||||
* GetPage ('G'): Return a page image from buffer cache.
|
||||
 * Ping ('H'): Return a page-sized dummy response (liveness check).
|
||||
*
|
||||
 * Currently, you only get a response to GetPage and Ping requests; the response is
|
||||
 * simply an 8k page, without any headers. Errors are logged to stderr.
|
||||
@@ -133,6 +134,7 @@ static void ApplyRecord(StringInfo input_message);
|
||||
static void apply_error_callback(void *arg);
|
||||
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
|
||||
static void GetPage(StringInfo input_message);
|
||||
static void Ping(StringInfo input_message);
|
||||
static ssize_t buffered_read(void *buf, size_t count);
|
||||
static void CreateFakeSharedMemoryAndSemaphores();
|
||||
|
||||
@@ -394,6 +396,10 @@ WalRedoMain(int argc, char *argv[])
|
||||
GetPage(&input_message);
|
||||
break;
|
||||
|
||||
case 'H': /* Ping */
|
||||
Ping(&input_message);
|
||||
break;
|
||||
|
||||
/*
|
||||
* EOF means we're done. Perform normal shutdown.
|
||||
*/
|
||||
@@ -1057,6 +1063,36 @@ GetPage(StringInfo input_message)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
Ping(StringInfo input_message)
|
||||
{
|
||||
int tot_written;
|
||||
/* Response: a full dummy page (BLCKSZ bytes) */
|
||||
tot_written = 0;
|
||||
do {
|
||||
ssize_t rc;
|
||||
/* We don't need alignment, but it's bad practice to use char[BLCKSZ] */
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
static const PGIOAlignedBlock response;
|
||||
#else
|
||||
static const PGAlignedBlock response;
|
||||
#endif
|
||||
rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written);
|
||||
if (rc < 0) {
|
||||
/* If interrupted by signal, just retry */
|
||||
if (errno == EINTR)
|
||||
continue;
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to stdout: %m")));
|
||||
}
|
||||
tot_written += rc;
|
||||
} while (tot_written < BLCKSZ);
|
||||
|
||||
elog(TRACE, "Page sent back for ping");
|
||||
}
|
||||
|
||||
|
||||
/* Buffer used by buffered_read() */
|
||||
static char stdin_buf[16 * 1024];
|
||||
static size_t stdin_len = 0; /* # of bytes in buffer */
|
||||
|
||||
@@ -29,7 +29,6 @@ dashmap.workspace = true
|
||||
env_logger.workspace = true
|
||||
framed-websockets.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
hashlink.workspace = true
|
||||
hex.workspace = true
|
||||
|
||||
@@ -444,7 +444,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
Self::Web(url, ()) => {
|
||||
info!("performing web authentication");
|
||||
|
||||
let info = web::authenticate(ctx, &url, client).await?;
|
||||
let info = web::authenticate(ctx, config, &url, client).await?;
|
||||
|
||||
Backend::Web(url, info)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::{
|
||||
auth, compute,
|
||||
config::AuthenticationConfig,
|
||||
console::{self, provider::NodeInfo},
|
||||
context::RequestMonitoring,
|
||||
error::{ReportableError, UserFacingError},
|
||||
@@ -58,6 +59,7 @@ pub(crate) fn new_psql_session_id() -> String {
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
ctx: &RequestMonitoring,
|
||||
auth_config: &'static AuthenticationConfig,
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<NodeInfo> {
|
||||
@@ -89,6 +91,14 @@ pub(super) async fn authenticate(
|
||||
info!(parent: &span, "waiting for console's reply...");
|
||||
let db_info = waiter.await.map_err(WebAuthError::from)?;
|
||||
|
||||
if auth_config.ip_allowlist_check_enabled {
|
||||
if let Some(allowed_ips) = &db_info.allowed_ips {
|
||||
if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
|
||||
|
||||
// This config should be self-contained, because we won't
|
||||
|
||||
@@ -284,6 +284,8 @@ pub(crate) struct DatabaseInfo {
|
||||
/// be inconvenient for debug with local PG instance.
|
||||
pub(crate) password: Option<Box<str>>,
|
||||
pub(crate) aux: MetricsAuxInfo,
|
||||
#[serde(default)]
|
||||
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -294,6 +296,7 @@ impl fmt::Debug for DatabaseInfo {
|
||||
.field("port", &self.port)
|
||||
.field("dbname", &self.dbname)
|
||||
.field("user", &self.user)
|
||||
.field("allowed_ips", &self.allowed_ips)
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
@@ -432,6 +435,22 @@ mod tests {
|
||||
"aux": dummy_aux(),
|
||||
}))?;
|
||||
|
||||
// with allowed_ips
|
||||
let dbinfo = serde_json::from_value::<DatabaseInfo>(json!({
|
||||
"host": "localhost",
|
||||
"port": 5432,
|
||||
"dbname": "postgres",
|
||||
"user": "john_doe",
|
||||
"password": "password",
|
||||
"aux": dummy_aux(),
|
||||
"allowed_ips": ["127.0.0.1"],
|
||||
}))?;
|
||||
|
||||
assert_eq!(
|
||||
dbinfo.allowed_ips,
|
||||
Some(vec![IpPattern::Single("127.0.0.1".parse()?)])
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ chrono.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
crc32c.workspace = true
|
||||
fail.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
|
||||
@@ -15,7 +15,6 @@ const_format.workspace = true
|
||||
futures.workspace = true
|
||||
futures-core.workspace = true
|
||||
futures-util.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -20,7 +20,6 @@ chrono.workspace = true
|
||||
clap.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
|
||||
@@ -71,6 +71,37 @@ impl ComputeHookTenant {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sharded(&self) -> bool {
|
||||
matches!(self, ComputeHookTenant::Sharded(_))
|
||||
}
|
||||
|
||||
/// Clear compute hook state for the specified shard.
|
||||
/// Only valid for [`ComputeHookTenant::Sharded`] instances.
|
||||
fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) {
|
||||
match self {
|
||||
ComputeHookTenant::Sharded(sharded) => {
|
||||
if sharded.stripe_size != stripe_size
|
||||
|| sharded.shard_count != tenant_shard_id.shard_count
|
||||
{
|
||||
tracing::warn!("Shard split detected while handling detach")
|
||||
}
|
||||
|
||||
let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| {
|
||||
*shard_number == tenant_shard_id.shard_number
|
||||
});
|
||||
|
||||
if let Some(shard_idx) = shard_idx {
|
||||
sharded.shards.remove(shard_idx);
|
||||
} else {
|
||||
tracing::warn!("Shard not found while handling detach")
|
||||
}
|
||||
}
|
||||
ComputeHookTenant::Unsharded(_) => {
|
||||
unreachable!("Detach of unsharded tenants is handled externally");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
|
||||
/// and drops existing content.
|
||||
fn update(
|
||||
@@ -614,6 +645,36 @@ impl ComputeHook {
|
||||
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Reflect a detach for a particular shard in the compute hook state.
|
||||
///
|
||||
/// The goal is to avoid sending compute notifications with stale information (i.e.
|
||||
/// including detached pageservers).
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(super) fn handle_detach(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
stripe_size: ShardStripeSize,
|
||||
) {
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
let mut state_locked = self.state.lock().unwrap();
|
||||
match state_locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(_) => {
|
||||
tracing::warn!("Compute hook tenant not found for detach");
|
||||
}
|
||||
Entry::Occupied(mut e) => {
|
||||
let sharded = e.get().is_sharded();
|
||||
if !sharded {
|
||||
e.remove();
|
||||
} else {
|
||||
e.get_mut().remove_shard(tenant_shard_id, stripe_size);
|
||||
}
|
||||
|
||||
tracing::debug!("Compute hook handled shard detach");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -515,7 +515,7 @@ async fn handle_tenant_timeline_passthrough(
|
||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||
|
||||
// Find the node that holds shard zero
|
||||
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
|
||||
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?;
|
||||
|
||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||
// rewrite this to a shard-aware shard zero ID.
|
||||
@@ -545,10 +545,10 @@ async fn handle_tenant_timeline_passthrough(
|
||||
let _timer = latency.start_timer(labels.clone());
|
||||
|
||||
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
|
||||
let resp = client.get_raw(path).await.map_err(|_e|
|
||||
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
|
||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||
ApiError::ShuttingDown)?;
|
||||
let resp = client.get_raw(path).await.map_err(|e|
|
||||
// We return 503 here because if we can't successfully send a request to the pageserver,
|
||||
// either we aren't available or the pageserver is unavailable.
|
||||
ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let error_counter = &METRICS_REGISTRY
|
||||
@@ -557,6 +557,19 @@ async fn handle_tenant_timeline_passthrough(
|
||||
error_counter.inc(labels);
|
||||
}
|
||||
|
||||
// Transform 404 into 503 if we raced with a migration
|
||||
if resp.status() == reqwest::StatusCode::NOT_FOUND {
|
||||
// Look up the node again: if the shard migrated, it will be different
|
||||
let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?;
|
||||
if new_node.get_id() != node.get_id() {
|
||||
// Rather than retry here, send the client a 503 to prompt a retry: this matches
|
||||
// the pageserver's use of 503, and all clients calling this API should retry on 503.
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
format!("Pageserver {node} returned 404, was migrated to {new_node}").into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// We have a reqwest::Response; we would like an http::Response
|
||||
let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?);
|
||||
for (k, v) in resp.headers() {
|
||||
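Callers of this passthrough API are expected to retry on 503, per the comment above. A hypothetical client-side loop (not part of this change) might look like:

    async fn get_with_retry(client: &reqwest::Client, url: &str) -> reqwest::Result<reqwest::Response> {
        loop {
            let resp = client.get(url).send().await?;
            if resp.status() == reqwest::StatusCode::SERVICE_UNAVAILABLE {
                // The storage controller signals "retry later", e.g. while a
                // shard migration is in flight.
                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
                continue;
            }
            return Ok(resp);
        }
    }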
@@ -1849,7 +1862,7 @@ pub fn make_router(
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
})
|
||||
.post(
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
|
||||
@@ -2,8 +2,8 @@ use std::{str::FromStr, time::Duration};
|
||||
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
|
||||
TenantLocateResponseShard,
|
||||
AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest,
|
||||
NodeSchedulingPolicy, TenantLocateResponseShard,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -36,7 +36,7 @@ pub(crate) struct Node {
|
||||
listen_pg_addr: String,
|
||||
listen_pg_port: u16,
|
||||
|
||||
availability_zone_id: String,
|
||||
availability_zone_id: AvailabilityZone,
|
||||
|
||||
// This cancellation token means "stop any RPCs in flight to this node, and don't start
|
||||
// any more". It is not related to process shutdown.
|
||||
@@ -64,8 +64,8 @@ impl Node {
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub(crate) fn get_availability_zone_id(&self) -> &str {
|
||||
self.availability_zone_id.as_str()
|
||||
pub(crate) fn get_availability_zone_id(&self) -> &AvailabilityZone {
|
||||
&self.availability_zone_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
|
||||
@@ -181,7 +181,7 @@ impl Node {
|
||||
listen_http_port: u16,
|
||||
listen_pg_addr: String,
|
||||
listen_pg_port: u16,
|
||||
availability_zone_id: String,
|
||||
availability_zone_id: AvailabilityZone,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
@@ -204,7 +204,7 @@ impl Node {
|
||||
listen_http_port: self.listen_http_port as i32,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
listen_pg_port: self.listen_pg_port as i32,
|
||||
availability_zone_id: self.availability_zone_id.clone(),
|
||||
availability_zone_id: self.availability_zone_id.0.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -219,7 +219,7 @@ impl Node {
|
||||
listen_http_port: np.listen_http_port as u16,
|
||||
listen_pg_addr: np.listen_pg_addr,
|
||||
listen_pg_port: np.listen_pg_port as u16,
|
||||
availability_zone_id: np.availability_zone_id,
|
||||
availability_zone_id: AvailabilityZone(np.availability_zone_id),
|
||||
cancel: CancellationToken::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,7 +238,7 @@ impl PageserverClient {
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"timeline_archival_config",
|
||||
crate::metrics::Method::Post,
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_archival_config(tenant_shard_id, timeline_id, req)
|
||||
|
||||
@@ -9,6 +9,7 @@ use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::controller_api::AvailabilityZone;
|
||||
use pageserver_api::controller_api::MetadataHealthRecord;
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
@@ -667,8 +668,8 @@ impl Persistence {
|
||||
|
||||
pub(crate) async fn set_tenant_shard_preferred_azs(
|
||||
&self,
|
||||
preferred_azs: Vec<(TenantShardId, String)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, String)>> {
|
||||
preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
|
||||
@@ -679,7 +680,7 @@ impl Persistence {
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set(preferred_az_id.eq(preferred_az))
|
||||
.set(preferred_az_id.eq(preferred_az.0.clone()))
|
||||
.execute(conn)?;
|
||||
|
||||
if updated == 1 {
|
||||
|
||||
@@ -463,7 +463,7 @@ impl Reconciler {
|
||||
for (timeline_id, baseline_lsn) in &baseline {
|
||||
match latest.get(timeline_id) {
|
||||
Some(latest_lsn) => {
|
||||
tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
||||
tracing::info!(timeline_id = %timeline_id, "🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
||||
if latest_lsn < baseline_lsn {
|
||||
any_behind = true;
|
||||
}
|
||||
@@ -541,6 +541,8 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
pausable_failpoint!("reconciler-live-migrate-pre-generation-inc");
|
||||
|
||||
// Increment generation before attaching to new pageserver
|
||||
self.generation = Some(
|
||||
self.persistence
|
||||
@@ -617,6 +619,8 @@ impl Reconciler {
|
||||
},
|
||||
);
|
||||
|
||||
pausable_failpoint!("reconciler-live-migrate-post-detach");
|
||||
|
||||
tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",);
|
||||
let dest_final_conf = build_location_config(
|
||||
&self.shard,
|
||||
@@ -820,6 +824,16 @@ impl Reconciler {
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
}
|
||||
|
||||
// The condition below identifies a detach. We must have no attached intent and
|
||||
// must have been attached to something previously. Pass this information to
|
||||
// the [`ComputeHook`] such that it can update its tenant-wide state.
|
||||
if self.intent.attached.is_none() && !self.detach.is_empty() {
|
||||
// TODO: Consider notifying control plane about detaches. This would avoid situations
|
||||
// where the compute tries to start up with a stale set of pageservers.
|
||||
self.compute_hook
|
||||
.handle_detach(self.tenant_shard_id, self.shard.stripe_size);
|
||||
}
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::models::PageserverUtilization;
|
||||
use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use std::{collections::HashMap, fmt::Debug};
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
/// Scenarios in which we cannot find a suitable location for a tenant shard
|
||||
@@ -27,17 +27,230 @@ pub enum MaySchedule {
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct SchedulerNode {
|
||||
pub(crate) struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
shard_count: usize,
|
||||
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
attached_shard_count: usize,
|
||||
/// Availability zone id in which the node resides
|
||||
az: AvailabilityZone,
|
||||
|
||||
/// Whether this node is currently eligible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
may_schedule: MaySchedule,
|
||||
}
|
||||
|
||||
pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
|
||||
fn generate(
|
||||
node_id: &NodeId,
|
||||
node: &mut SchedulerNode,
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Self>;
|
||||
fn is_overloaded(&self) -> bool;
|
||||
fn node_id(&self) -> NodeId;
|
||||
}
|
||||
|
||||
pub(crate) trait ShardTag {
|
||||
type Score: NodeSchedulingScore;
|
||||
}
|
||||
|
||||
pub(crate) struct AttachedShardTag {}
|
||||
impl ShardTag for AttachedShardTag {
|
||||
type Score = NodeAttachmentSchedulingScore;
|
||||
}
|
||||
|
||||
pub(crate) struct SecondaryShardTag {}
|
||||
impl ShardTag for SecondaryShardTag {
|
||||
type Score = NodeSecondarySchedulingScore;
|
||||
}
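The `ShardTag` trait above is a zero-sized marker whose only job is to select, at compile time, which score type the scheduler computes. A self-contained sketch of the same tag-with-associated-type pattern; all names below are toy ones chosen for illustration, not the crate's:

```rust
// A score orders candidate nodes; lower is better.
trait Score: Ord {
    fn compute(node_load: u32) -> Self;
}

#[derive(PartialEq, Eq, PartialOrd, Ord)]
struct AttachedScore(u32);
impl Score for AttachedScore {
    fn compute(node_load: u32) -> Self {
        AttachedScore(node_load)
    }
}

#[derive(PartialEq, Eq, PartialOrd, Ord)]
struct SecondaryScore(u32);
impl Score for SecondaryScore {
    fn compute(node_load: u32) -> Self {
        // Toy twist: secondaries prefer the most loaded node, purely to show the
        // tag changing behaviour without changing the call site.
        SecondaryScore(u32::MAX - node_load)
    }
}

// The tag itself carries no data; it only names a Score type.
trait Tag {
    type Score: Score;
}
struct Attached;
impl Tag for Attached {
    type Score = AttachedScore;
}
struct Secondary;
impl Tag for Secondary {
    type Score = SecondaryScore;
}

fn pick_node<T: Tag>(loads: &[(u64, u32)]) -> u64 {
    loads
        .iter()
        .min_by_key(|(_, load)| T::Score::compute(*load))
        .map(|(id, _)| *id)
        .expect("no nodes")
}

fn main() {
    let loads = [(1, 10), (2, 3)];
    assert_eq!(pick_node::<Attached>(&loads), 2);
    assert_eq!(pick_node::<Secondary>(&loads), 1);
}
```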
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
|
||||
enum AzMatch {
|
||||
Yes,
|
||||
No,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl AzMatch {
|
||||
fn new(node_az: &AvailabilityZone, shard_preferred_az: Option<&AvailabilityZone>) -> Self {
|
||||
match shard_preferred_az {
|
||||
Some(preferred_az) if preferred_az == node_az => Self::Yes,
|
||||
Some(_preferred_az) => Self::No,
|
||||
None => Self::Unknown,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
|
||||
struct AttachmentAzMatch(AzMatch);
|
||||
|
||||
impl Ord for AttachmentAzMatch {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
// Lower scores indicate a more suitable node.
|
||||
// Note that we prefer a node for which we don't have
|
||||
// info to a node which we are certain doesn't match the
|
||||
// preferred AZ of the shard.
|
||||
let az_match_score = |az_match: &AzMatch| match az_match {
|
||||
AzMatch::Yes => 0,
|
||||
AzMatch::Unknown => 1,
|
||||
AzMatch::No => 2,
|
||||
};
|
||||
|
||||
az_match_score(&self.0).cmp(&az_match_score(&other.0))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for AttachmentAzMatch {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
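The two `Ord` impls here differ only in which AZ relationship they rank first: attachments prefer a matching AZ, unknown beats a certain mismatch. A small standalone illustration of that ordering (the enum and helper are hypothetical, not the crate's types):

```rust
// Illustrative ranking: matching AZ first, unknown next, definite mismatch last.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
enum AzRank {
    Match,
    Unknown,
    Mismatch,
}

fn az_rank(node_az: &str, preferred: Option<&str>) -> AzRank {
    match preferred {
        Some(p) if p == node_az => AzRank::Match,
        Some(_) => AzRank::Mismatch,
        None => AzRank::Unknown,
    }
}

fn main() {
    let mut candidates = vec![("node-b", "az-b"), ("node-a", "az-a"), ("node-c", "az-c")];
    // Rank candidates for an attachment that prefers "az-a": the matching node sorts first.
    candidates.sort_by_key(|(_, az)| az_rank(az, Some("az-a")));
    assert_eq!(candidates[0].0, "node-a");
}
```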
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
|
||||
struct SecondaryAzMatch(AzMatch);
|
||||
|
||||
impl Ord for SecondaryAzMatch {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
// Lower scores indicate a more suitable node.
|
||||
// For secondary locations we wish to avoid the preferred AZ
|
||||
// of the shard.
|
||||
let az_match_score = |az_match: &AzMatch| match az_match {
|
||||
AzMatch::No => 0,
|
||||
AzMatch::Unknown => 1,
|
||||
AzMatch::Yes => 2,
|
||||
};
|
||||
|
||||
az_match_score(&self.0).cmp(&az_match_score(&other.0))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for SecondaryAzMatch {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Scheduling score of a given node for shard attachments.
|
||||
/// Lower scores indicate more suitable nodes.
|
||||
/// Ordering is given by member declaration order (top to bottom).
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub(crate) struct NodeAttachmentSchedulingScore {
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Flag indicating whether this node matches the preferred AZ
|
||||
/// of the shard. For equal affinity scores, nodes in the matching AZ
|
||||
/// are considered first.
|
||||
az_match: AttachmentAzMatch,
|
||||
/// Size of [`ScheduleContext::attached_nodes`] for the current node.
|
||||
/// This normally tracks the number of attached shards belonging to the
|
||||
/// tenant being scheduled that are already on this node.
|
||||
attached_shards_in_context: usize,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
/// acts as an anti-affinity between attached shards.
|
||||
total_attached_shard_count: usize,
|
||||
/// Convenience to make selection deterministic in tests and empty systems
|
||||
node_id: NodeId,
|
||||
}
|
||||
|
||||
impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
|
||||
fn generate(
|
||||
node_id: &NodeId,
|
||||
node: &mut SchedulerNode,
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Self> {
|
||||
let utilization = match &mut node.may_schedule {
|
||||
MaySchedule::Yes(u) => u,
|
||||
MaySchedule::No => {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
Some(Self {
|
||||
affinity_score: context
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE),
|
||||
az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
|
||||
attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
|
||||
utilization_score: utilization.cached_score(),
|
||||
total_attached_shard_count: node.attached_shard_count,
|
||||
node_id: *node_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn is_overloaded(&self) -> bool {
|
||||
PageserverUtilization::is_overloaded(self.utilization_score)
|
||||
}
|
||||
|
||||
fn node_id(&self) -> NodeId {
|
||||
self.node_id
|
||||
}
|
||||
}
|
||||
|
||||
/// Scheduling score of a given node for shard secondaries.
|
||||
/// Lower scores indicate more suitable nodes.
|
||||
/// Ordering is given by member declaration order (top to bottom).
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub(crate) struct NodeSecondarySchedulingScore {
|
||||
/// Flag indicating whether this node matches the preferred AZ
|
||||
/// of the shard. For secondary locations we wish to avoid nodes in
|
||||
/// the preferred AZ of the shard, since that's where the attached location
|
||||
/// should be scheduled and having the secondary in the same AZ is bad for HA.
|
||||
az_match: SecondaryAzMatch,
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
/// acts as an anti-affinity between attached shards.
|
||||
total_attached_shard_count: usize,
|
||||
/// Convenience to make selection deterministic in tests and empty systems
|
||||
node_id: NodeId,
|
||||
}
|
||||
|
||||
impl NodeSchedulingScore for NodeSecondarySchedulingScore {
|
||||
fn generate(
|
||||
node_id: &NodeId,
|
||||
node: &mut SchedulerNode,
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Self> {
|
||||
let utilization = match &mut node.may_schedule {
|
||||
MaySchedule::Yes(u) => u,
|
||||
MaySchedule::No => {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
Some(Self {
|
||||
az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
|
||||
affinity_score: context
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE),
|
||||
utilization_score: utilization.cached_score(),
|
||||
total_attached_shard_count: node.attached_shard_count,
|
||||
node_id: *node_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn is_overloaded(&self) -> bool {
|
||||
PageserverUtilization::is_overloaded(self.utilization_score)
|
||||
}
|
||||
|
||||
fn node_id(&self) -> NodeId {
|
||||
self.node_id
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for SchedulerNode {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
let may_schedule_matches = matches!(
|
||||
@@ -48,6 +261,7 @@ impl PartialEq for SchedulerNode {
|
||||
may_schedule_matches
|
||||
&& self.shard_count == other.shard_count
|
||||
&& self.attached_shard_count == other.attached_shard_count
|
||||
&& self.az == other.az
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,6 +376,7 @@ impl Scheduler {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -188,6 +403,7 @@ impl Scheduler {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -366,6 +582,7 @@ impl Scheduler {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -406,6 +623,29 @@ impl Scheduler {
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
/// Compute a scheduling score for each node that the scheduler knows of
|
||||
/// minus a set of hard excluded nodes.
|
||||
fn compute_node_scores<Score>(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Vec<Score>
|
||||
where
|
||||
Score: NodeSchedulingScore,
|
||||
{
|
||||
self.nodes
|
||||
.iter_mut()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) {
|
||||
None
|
||||
} else {
|
||||
Score::generate(k, v, preferred_az, context)
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
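`compute_node_scores` boils down to a filter-then-score pass over the node map: hard-excluded nodes vanish, everything else becomes a sortable score. A toy sketch of that shape, with plain integers standing in for the real utilization and score types:

```rust
use std::collections::HashMap;

// Hard-excluded nodes are dropped entirely; every other node is mapped to a
// (load, id) score, lowest first.
fn compute_scores(nodes: &HashMap<u64, u32>, hard_exclude: &[u64]) -> Vec<(u32, u64)> {
    let mut scores: Vec<(u32, u64)> = nodes
        .iter()
        .filter_map(|(id, load)| {
            if hard_exclude.contains(id) {
                None
            } else {
                Some((*load, *id))
            }
        })
        .collect();
    scores.sort();
    scores
}

fn main() {
    let nodes = HashMap::from([(1, 5), (2, 1), (3, 9)]);
    // Node 2 would win on load alone, but it is hard-excluded
    // (e.g. it already holds this shard), so node 1 is chosen.
    let scores = compute_scores(&nodes, &[2]);
    assert_eq!(scores.first().map(|(_, id)| *id), Some(1));
}
```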
|
||||
|
||||
/// hard_exclude: it is forbidden to use nodes in this list, typically because they
|
||||
/// are already in use by this shard -- we use this to avoid picking the same node
|
||||
/// as both attached and secondary location. This is a hard constraint: if we cannot
|
||||
@@ -415,29 +655,18 @@ impl Scheduler {
|
||||
/// to their anti-affinity score. We use this to prefer to avoid placing shards in
|
||||
/// the same tenant on the same node. This is a soft constraint: the context will never
|
||||
/// cause us to fail to schedule a shard.
|
||||
pub(crate) fn schedule_shard(
|
||||
pub(crate) fn schedule_shard<Tag: ShardTag>(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
|
||||
.nodes
|
||||
.iter_mut()
|
||||
.filter_map(|(k, v)| match &mut v.may_schedule {
|
||||
MaySchedule::No => None,
|
||||
MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
|
||||
MaySchedule::Yes(utilization) => Some((
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
utilization.cached_score(),
|
||||
v.attached_shard_count,
|
||||
)),
|
||||
})
|
||||
.collect();
|
||||
let mut scores =
|
||||
self.compute_node_scores::<Tag::Score>(hard_exclude, preferred_az, context);
|
||||
|
||||
// Exclude nodes whose utilization is critically high, if there are alternatives available. This will
|
||||
// cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
|
||||
@@ -445,20 +674,18 @@ impl Scheduler {
|
||||
// overloaded.
|
||||
let non_overloaded_scores = scores
|
||||
.iter()
|
||||
.filter(|i| !PageserverUtilization::is_overloaded(i.2))
|
||||
.filter(|i| !i.is_overloaded())
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
if !non_overloaded_scores.is_empty() {
|
||||
scores = non_overloaded_scores;
|
||||
}
|
||||
|
||||
// Sort by, in order of precedence:
|
||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
||||
// 2nd: Utilization score (this combines shard count and disk utilization)
|
||||
// 3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some
|
||||
// empty nodes), this acts as an anti-affinity between attached shards.
|
||||
// 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
|
||||
// Sort the nodes by score. The one with the lowest scores will be the preferred node.
|
||||
// Refer to [`NodeAttachmentSchedulingScore`] for attached locations and
|
||||
// [`NodeSecondarySchedulingScore`] for secondary locations to understand how the nodes
|
||||
// are ranked.
|
||||
scores.sort();
|
||||
|
||||
if scores.is_empty() {
|
||||
// After applying constraints, no pageservers were left.
|
||||
@@ -481,12 +708,12 @@ impl Scheduler {
|
||||
}
|
||||
|
||||
// Lowest score wins
|
||||
let node_id = scores.first().unwrap().0;
|
||||
let node_id = scores.first().unwrap().node_id();
|
||||
|
||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -496,6 +723,12 @@ impl Scheduler {
|
||||
Ok(node_id)
|
||||
}
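The plain `scores.sort()` above works because each score struct derives `Ord`, and a derived `Ord` compares fields in declaration order. A hedged standalone illustration (the struct below is a toy, not `NodeAttachmentSchedulingScore`):

```rust
// Deriving Ord on a struct compares fields lexicographically in declaration order,
// which is what lets a plain sort() rank candidates by precedence.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
struct ToyScore {
    affinity: u8,      // compared first
    az_mismatch: bool, // only consulted on affinity ties
    node_id: u64,      // final tie-breaker keeps selection deterministic
}

fn main() {
    let mut scores = vec![
        ToyScore { affinity: 1, az_mismatch: false, node_id: 3 },
        ToyScore { affinity: 0, az_mismatch: true, node_id: 2 },
        ToyScore { affinity: 0, az_mismatch: false, node_id: 1 },
    ];
    scores.sort();
    // Lowest affinity wins; the AZ flag only breaks ties between equal affinities.
    assert_eq!(scores.first().unwrap().node_id, 1);
}
```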
|
||||
|
||||
/// Selects any available node. This is suitable for performing background work (e.g. S3
|
||||
/// deletions).
|
||||
pub(crate) fn any_available_node(&mut self) -> Result<NodeId, ScheduleError> {
|
||||
self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default())
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
@@ -512,13 +745,22 @@ impl Scheduler {
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
|
||||
use pageserver_api::{
|
||||
controller_api::{AvailabilityZone, NodeAvailability},
|
||||
models::utilization::test_utilization,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
///
|
||||
/// Node IDs start at one.
|
||||
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
|
||||
///
|
||||
/// The `azs` argument specifies the list of availability zones which will be assigned
|
||||
/// to nodes in round-robin fashion. If empty, a default AZ is assigned.
|
||||
pub(crate) fn make_test_nodes(n: u64, azs: &[AvailabilityZone]) -> HashMap<NodeId, Node> {
|
||||
let mut az_iter = azs.iter().cycle();
|
||||
|
||||
(1..n + 1)
|
||||
.map(|i| {
|
||||
(NodeId(i), {
|
||||
@@ -528,7 +770,10 @@ pub(crate) mod test_utils {
|
||||
80 + i as u16,
|
||||
format!("pghost-{i}"),
|
||||
5432 + i as u16,
|
||||
"test-az".to_string(),
|
||||
az_iter
|
||||
.next()
|
||||
.cloned()
|
||||
.unwrap_or(AvailabilityZone("test-az".to_string())),
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
|
||||
assert!(node.is_available());
|
||||
@@ -548,7 +793,7 @@ mod tests {
|
||||
use crate::tenant_shard::IntentState;
|
||||
#[test]
|
||||
fn scheduler_basic() -> anyhow::Result<()> {
|
||||
let nodes = test_utils::make_test_nodes(2);
|
||||
let nodes = test_utils::make_test_nodes(2, &[]);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new();
|
||||
@@ -556,9 +801,9 @@ mod tests {
|
||||
|
||||
let context = ScheduleContext::default();
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?;
|
||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
|
||||
@@ -567,7 +812,11 @@ mod tests {
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
|
||||
let scheduled = scheduler.schedule_shard::<AttachedShardTag>(
|
||||
&t1_intent.all_pageservers(),
|
||||
&None,
|
||||
&context,
|
||||
)?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
|
||||
@@ -607,7 +856,7 @@ mod tests {
|
||||
#[test]
|
||||
/// Test the PageserverUtilization's contribution to scheduling algorithm
|
||||
fn scheduler_utilization() {
|
||||
let mut nodes = test_utils::make_test_nodes(3);
|
||||
let mut nodes = test_utils::make_test_nodes(3, &[]);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
// Need to keep these alive because they contribute to shard counts via RAII
|
||||
@@ -621,7 +870,9 @@ mod tests {
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) {
|
||||
let scheduled = scheduler.schedule_shard(&[], context).unwrap();
|
||||
let scheduled = scheduler
|
||||
.schedule_shard::<AttachedShardTag>(&[], &None, context)
|
||||
.unwrap();
|
||||
let mut intent = IntentState::new();
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
@@ -729,4 +980,98 @@ mod tests {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// A simple test that showcases AZ-aware scheduling and its interaction with
|
||||
/// affinity scores.
|
||||
fn az_scheduling() {
|
||||
let az_a_tag = AvailabilityZone("az-a".to_string());
|
||||
let az_b_tag = AvailabilityZone("az-b".to_string());
|
||||
|
||||
let nodes = test_utils::make_test_nodes(3, &[az_a_tag.clone(), az_b_tag.clone()]);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
// Need to keep these alive because they contribute to shard counts via RAII
|
||||
let mut scheduled_intents = Vec::new();
|
||||
|
||||
let mut context = ScheduleContext::default();
|
||||
|
||||
fn assert_scheduler_chooses<Tag: ShardTag>(
|
||||
expect_node: NodeId,
|
||||
preferred_az: Option<AvailabilityZone>,
|
||||
scheduled_intents: &mut Vec<IntentState>,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &mut ScheduleContext,
|
||||
) {
|
||||
let scheduled = scheduler
|
||||
.schedule_shard::<Tag>(&[], &preferred_az, context)
|
||||
.unwrap();
|
||||
let mut intent = IntentState::new();
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
assert_eq!(scheduled, expect_node);
|
||||
|
||||
context.avoid(&[scheduled]);
|
||||
}
|
||||
|
||||
assert_scheduler_chooses::<AttachedShardTag>(
|
||||
NodeId(1),
|
||||
Some(az_a_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Node 2 and 3 have affinity score equal to 0, but node 3
|
||||
// is in "az-a" so we prefer that.
|
||||
assert_scheduler_chooses::<AttachedShardTag>(
|
||||
NodeId(3),
|
||||
Some(az_a_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Node 2 is not in "az-a", but it has the lowest affinity so we prefer that.
|
||||
assert_scheduler_chooses::<AttachedShardTag>(
|
||||
NodeId(2),
|
||||
Some(az_a_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-a" for the secondary location.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(2),
|
||||
Some(az_a_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-b" for the secondary location.
|
||||
// Nodes 1 and 3 are identically loaded, so prefer the lowest node id.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(1),
|
||||
Some(az_b_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-b" for the secondary location.
|
||||
// Node 3 has lower affinity score than 1, so prefer that.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(3),
|
||||
Some(az_b_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
for mut intent in scheduled_intents {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::{
|
||||
borrow::Cow,
|
||||
cmp::Ordering,
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
error::Error,
|
||||
ops::Deref,
|
||||
path::PathBuf,
|
||||
str::FromStr,
|
||||
@@ -218,9 +219,16 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
|
||||
format!("{node} error receiving error body: {str}").into(),
|
||||
)
|
||||
}
|
||||
mgmt_api::Error::ReceiveBody(str) => {
|
||||
// Presume errors receiving body are connectivity/availability issues
|
||||
ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
|
||||
mgmt_api::Error::ReceiveBody(err) if err.is_decode() => {
|
||||
// Return 500 for decoding errors.
|
||||
ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body"))
|
||||
}
|
||||
mgmt_api::Error::ReceiveBody(err) => {
|
||||
// Presume errors receiving body are connectivity/availability issues except for decoding errors
|
||||
let src_str = err.source().map(|e| e.to_string()).unwrap_or_default();
|
||||
ApiError::ResourceUnavailable(
|
||||
format!("{node} error receiving error body: {err} {}", src_str).into(),
|
||||
)
|
||||
}
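The split above treats a body that arrived but could not be decoded as a server-side problem (500), and every other receive failure as an availability problem (503) that callers should retry. A minimal sketch of that classification rule with toy error and status types (not mgmt_api's):

```rust
// Toy error type standing in for "failed to receive/parse a response body".
#[derive(Debug)]
enum ToyReceiveError {
    Decode(String),
    Io(String),
    Timeout,
}

fn classify(err: &ToyReceiveError) -> u16 {
    match err {
        // The pageserver answered, but with a body we could not interpret: 500.
        ToyReceiveError::Decode(_) => 500,
        // Presume anything else is a connectivity/availability issue: 503, retryable.
        ToyReceiveError::Io(_) | ToyReceiveError::Timeout => 503,
    }
}

fn main() {
    assert_eq!(classify(&ToyReceiveError::Decode("bad json".into())), 500);
    assert_eq!(classify(&ToyReceiveError::Timeout), 503);
}
```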
|
||||
mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
|
||||
ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())
|
||||
@@ -1257,6 +1265,8 @@ impl Service {
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
use pageserver_api::controller_api::AvailabilityZone;
|
||||
|
||||
// Hack: insert scheduler state for all nodes referenced by shards, as compatibility
|
||||
// tests only store the shards, not the nodes. The nodes will be loaded shortly
|
||||
// after when pageservers start up and register.
|
||||
@@ -1274,7 +1284,7 @@ impl Service {
|
||||
123,
|
||||
"".to_string(),
|
||||
123,
|
||||
"test_az".to_string(),
|
||||
AvailabilityZone("test_az".to_string()),
|
||||
);
|
||||
|
||||
scheduler.node_upsert(&node);
|
||||
@@ -2091,7 +2101,7 @@ impl Service {
|
||||
let az_id = locked
|
||||
.nodes
|
||||
.get(&resp.node_id)
|
||||
.map(|n| n.get_availability_zone_id().to_string())?;
|
||||
.map(|n| n.get_availability_zone_id().clone())?;
|
||||
|
||||
Some((resp.shard_id, az_id))
|
||||
})
|
||||
@@ -2621,7 +2631,7 @@ impl Service {
|
||||
let scheduler = &mut locked.scheduler;
|
||||
// Right now we only perform the operation on a single node without parallelization
|
||||
// TODO fan out the operation to multiple nodes for better performance
|
||||
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
|
||||
let node_id = scheduler.any_available_node()?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
@@ -2807,7 +2817,7 @@ impl Service {
|
||||
|
||||
// Pick an arbitrary node to use for remote deletions (does not have to be where the tenant
|
||||
// was attached, just has to be able to see the S3 content)
|
||||
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
|
||||
let node_id = scheduler.any_available_node()?;
|
||||
let node = nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while lock is active");
|
||||
@@ -3498,34 +3508,66 @@ impl Service {
|
||||
|
||||
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
|
||||
/// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
|
||||
pub(crate) fn tenant_shard0_node(
|
||||
pub(crate) async fn tenant_shard0_node(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(Node, TenantShardId), ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some((tenant_shard_id, shard)) = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.next()
|
||||
// Look up in-memory state and maybe use the node from there.
|
||||
{
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some((tenant_shard_id, shard)) = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.next()
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
let Some(intent_node_id) = shard.intent.get_attached() else {
|
||||
tracing::warn!(
|
||||
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Shard not scheduled (policy {:?}), cannot generate pass-through URL",
|
||||
shard.policy
|
||||
);
|
||||
return Err(ApiError::Conflict(
|
||||
"Cannot call timeline API on non-attached tenant".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
if shard.reconciler.is_none() {
|
||||
// Optimization: while no reconcile is in flight, we may trust our in-memory state
|
||||
// to tell us which pageserver to use. Otherwise we will fall through and hit the database
|
||||
let Some(node) = locked.nodes.get(intent_node_id) else {
|
||||
// This should never happen
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Shard refers to nonexistent node"
|
||||
)));
|
||||
};
|
||||
return Ok((node.clone(), *tenant_shard_id));
|
||||
}
|
||||
};
|
||||
|
||||
// Look up the latest attached pageserver location from the database
|
||||
// generation state: this will reflect the progress of any ongoing migration.
|
||||
// Note that it is not guaranteed to _stay_ here, our caller must still handle
|
||||
// the case where they call through to the pageserver and get a 404.
|
||||
let db_result = self.persistence.tenant_generations(tenant_id).await?;
|
||||
let Some(ShardGenerationState {
|
||||
tenant_shard_id,
|
||||
generation: _,
|
||||
generation_pageserver: Some(node_id),
|
||||
}) = db_result.first()
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
|
||||
// This can happen if we raced with a tenant deletion or a shard split. On a retry
|
||||
// the caller will either succeed (shard split case), get a proper 404 (deletion case),
|
||||
// or a conflict response (case where tenant was detached in background)
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
"Shard {} not found in database, or is not attached".into(),
|
||||
));
|
||||
};
|
||||
|
||||
// TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
|
||||
// point to somewhere we haven't attached yet.
|
||||
let Some(node_id) = shard.intent.get_attached() else {
|
||||
tracing::warn!(
|
||||
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Shard not scheduled (policy {:?}), cannot generate pass-through URL",
|
||||
shard.policy
|
||||
);
|
||||
return Err(ApiError::Conflict(
|
||||
"Cannot call timeline API on non-attached tenant".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some(node) = locked.nodes.get(node_id) else {
|
||||
// This should never happen
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
@@ -4471,7 +4513,7 @@ impl Service {
|
||||
let az_id = locked
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.get_availability_zone_id().to_string())?;
|
||||
.map(|n| n.get_availability_zone_id().clone())?;
|
||||
|
||||
Some((*tid, az_id))
|
||||
})
|
||||
|
||||
@@ -8,11 +8,14 @@ use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
reconciler::{ReconcileUnits, ReconcilerConfig},
|
||||
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
|
||||
scheduler::{
|
||||
AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext,
|
||||
SecondaryShardTag,
|
||||
},
|
||||
service::ReconcileResultRequest,
|
||||
};
|
||||
use pageserver_api::controller_api::{
|
||||
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
|
||||
AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
|
||||
};
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
@@ -143,7 +146,7 @@ pub(crate) struct TenantShard {
|
||||
|
||||
// We should attempt to schedule this shard in the provided AZ to
|
||||
// decrease chances of cross-AZ compute.
|
||||
preferred_az_id: Option<String>,
|
||||
preferred_az_id: Option<AvailabilityZone>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
@@ -335,19 +338,19 @@ pub(crate) enum ReconcileWaitError {
|
||||
Failed(TenantShardId, Arc<ReconcileError>),
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
#[derive(Eq, PartialEq, Debug, Clone)]
|
||||
pub(crate) struct ReplaceSecondary {
|
||||
old_node_id: NodeId,
|
||||
new_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
#[derive(Eq, PartialEq, Debug, Clone)]
|
||||
pub(crate) struct MigrateAttachment {
|
||||
pub(crate) old_attached_node_id: NodeId,
|
||||
pub(crate) new_attached_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
#[derive(Eq, PartialEq, Debug, Clone)]
|
||||
pub(crate) enum ScheduleOptimizationAction {
|
||||
// Replace one of our secondary locations with a different node
|
||||
ReplaceSecondary(ReplaceSecondary),
|
||||
@@ -355,7 +358,7 @@ pub(crate) enum ScheduleOptimizationAction {
|
||||
MigrateAttachment(MigrateAttachment),
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
#[derive(Eq, PartialEq, Debug, Clone)]
|
||||
pub(crate) struct ScheduleOptimization {
|
||||
// What was the reconcile sequence when we generated this optimization? The optimization
|
||||
// should only be applied if the shard's sequence is still at this value, in case other changes
|
||||
@@ -537,13 +540,22 @@ impl TenantShard {
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
|
||||
let node_id = scheduler.schedule_shard::<AttachedShardTag>(
|
||||
&self.intent.secondary,
|
||||
&self.preferred_az_id,
|
||||
context,
|
||||
)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(
|
||||
tenant_id=%self.tenant_shard_id.tenant_id,
|
||||
shard_id=%self.tenant_shard_id.shard_slug(),
|
||||
sequence=%self.sequence
|
||||
))]
|
||||
pub(crate) fn schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
@@ -613,7 +625,11 @@ impl TenantShard {
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
|
||||
let node_id = scheduler.schedule_shard::<SecondaryShardTag>(
|
||||
&used_pageservers,
|
||||
&self.preferred_az_id,
|
||||
context,
|
||||
)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
@@ -626,7 +642,11 @@ impl TenantShard {
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
let node_id = scheduler.schedule_shard(&[], context)?;
|
||||
let node_id = scheduler.schedule_shard::<SecondaryShardTag>(
|
||||
&[],
|
||||
&self.preferred_az_id,
|
||||
context,
|
||||
)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
modified = true;
|
||||
}
|
||||
@@ -803,9 +823,11 @@ impl TenantShard {
|
||||
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
|
||||
// This implicitly limits the choice to nodes that are available, and prefers nodes
|
||||
// with lower utilization.
|
||||
let Ok(candidate_node) =
|
||||
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
|
||||
else {
|
||||
let Ok(candidate_node) = scheduler.schedule_shard::<SecondaryShardTag>(
|
||||
&self.intent.all_pageservers(),
|
||||
&self.preferred_az_id,
|
||||
schedule_context,
|
||||
) else {
|
||||
// A scheduling error means we have no possible candidate replacements
|
||||
continue;
|
||||
};
|
||||
@@ -1302,7 +1324,7 @@ impl TenantShard {
|
||||
pending_compute_notification: false,
|
||||
delayed_reconcile: false,
|
||||
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
|
||||
preferred_az_id: tsp.preferred_az_id,
|
||||
preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1318,25 +1340,28 @@ impl TenantShard {
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
|
||||
preferred_az_id: self.preferred_az_id.clone(),
|
||||
preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn preferred_az(&self) -> Option<&str> {
|
||||
self.preferred_az_id.as_deref()
|
||||
pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> {
|
||||
self.preferred_az_id.as_ref()
|
||||
}
|
||||
|
||||
pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) {
|
||||
pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) {
|
||||
self.preferred_az_id = Some(preferred_az_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use std::{cell::RefCell, rc::Rc};
|
||||
|
||||
use pageserver_api::{
|
||||
controller_api::NodeAvailability,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
use rand::{rngs::StdRng, SeedableRng};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::scheduler::test_utils::make_test_nodes;
|
||||
@@ -1365,7 +1390,11 @@ pub(crate) mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
|
||||
fn make_test_tenant(
|
||||
policy: PlacementPolicy,
|
||||
shard_count: ShardCount,
|
||||
preferred_az: Option<AvailabilityZone>,
|
||||
) -> Vec<TenantShard> {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
(0..shard_count.count())
|
||||
@@ -1377,7 +1406,7 @@ pub(crate) mod tests {
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantShard::new(
|
||||
let mut ts = TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
@@ -1386,7 +1415,13 @@ pub(crate) mod tests {
|
||||
)
|
||||
.unwrap(),
|
||||
policy.clone(),
|
||||
)
|
||||
);
|
||||
|
||||
if let Some(az) = &preferred_az {
|
||||
ts.set_preferred_az(az.clone());
|
||||
}
|
||||
|
||||
ts
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -1397,7 +1432,7 @@ pub(crate) mod tests {
|
||||
fn tenant_ha_scheduling() -> anyhow::Result<()> {
|
||||
// Start with three nodes. Our tenant will only use two. The third one is
|
||||
// expected to remain unused.
|
||||
let mut nodes = make_test_nodes(3);
|
||||
let mut nodes = make_test_nodes(3, &[]);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut context = ScheduleContext::default();
|
||||
@@ -1449,7 +1484,7 @@ pub(crate) mod tests {
|
||||
|
||||
#[test]
|
||||
fn intent_from_observed() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let nodes = make_test_nodes(3, &[]);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
@@ -1499,7 +1534,7 @@ pub(crate) mod tests {
|
||||
|
||||
#[test]
|
||||
fn scheduling_mode() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let nodes = make_test_nodes(3, &[]);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
@@ -1524,7 +1559,7 @@ pub(crate) mod tests {
|
||||
|
||||
#[test]
|
||||
fn optimize_attachment() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let nodes = make_test_nodes(3, &[]);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
@@ -1591,7 +1626,7 @@ pub(crate) mod tests {
|
||||
|
||||
#[test]
|
||||
fn optimize_secondary() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
let nodes = make_test_nodes(4, &[]);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
@@ -1637,12 +1672,14 @@ pub(crate) mod tests {
|
||||
|
||||
// Optimize til quiescent: this emulates what Service::optimize_all does, when
|
||||
// called repeatedly in the background.
|
||||
// Returns the applied optimizations
|
||||
fn optimize_til_idle(
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
scheduler: &mut Scheduler,
|
||||
shards: &mut [TenantShard],
|
||||
) {
|
||||
) -> Vec<ScheduleOptimization> {
|
||||
let mut loop_n = 0;
|
||||
let mut optimizations = Vec::default();
|
||||
loop {
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
let mut any_changed = false;
|
||||
@@ -1657,6 +1694,7 @@ pub(crate) mod tests {
|
||||
for shard in shards.iter_mut() {
|
||||
let optimization = shard.optimize_attachment(nodes, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
optimizations.push(optimization.clone());
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
@@ -1664,6 +1702,7 @@ pub(crate) mod tests {
|
||||
|
||||
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
optimizations.push(optimization.clone());
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
@@ -1678,20 +1717,22 @@ pub(crate) mod tests {
|
||||
loop_n += 1;
|
||||
assert!(loop_n < 1000);
|
||||
}
|
||||
|
||||
optimizations
|
||||
}
|
||||
|
||||
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
|
||||
/// that it converges.
|
||||
#[test]
|
||||
fn optimize_add_nodes() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
let nodes = make_test_nodes(4, &[]);
|
||||
|
||||
// Only show the scheduler a couple of nodes
|
||||
let mut scheduler = Scheduler::new([].iter());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
|
||||
|
||||
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
|
||||
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for shard in &mut shards {
|
||||
assert!(shard
|
||||
@@ -1730,4 +1771,191 @@ pub(crate) mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test that initial shard scheduling is optimal. By optimal we mean
|
||||
/// that the optimizer cannot find a way to improve it.
|
||||
///
|
||||
/// This test is an example of the scheduling issue described in
|
||||
/// https://github.com/neondatabase/neon/issues/8969
|
||||
#[test]
|
||||
fn initial_scheduling_is_optimal() -> anyhow::Result<()> {
|
||||
use itertools::Itertools;
|
||||
|
||||
let nodes = make_test_nodes(2, &[]);
|
||||
|
||||
let mut scheduler = Scheduler::new([].iter());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
|
||||
|
||||
let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
|
||||
let a_context = Rc::new(RefCell::new(ScheduleContext::default()));
|
||||
|
||||
let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
|
||||
let b_context = Rc::new(RefCell::new(ScheduleContext::default()));
|
||||
|
||||
let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone()));
|
||||
let b_shards_with_context = b.iter_mut().map(|shard| (shard, b_context.clone()));
|
||||
|
||||
let schedule_order = a_shards_with_context.interleave(b_shards_with_context);
|
||||
|
||||
for (shard, context) in schedule_order {
|
||||
let context = &mut *context.borrow_mut();
|
||||
shard.schedule(&mut scheduler, context).unwrap();
|
||||
}
|
||||
|
||||
let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a);
|
||||
assert_eq!(applied_to_a, vec![]);
|
||||
|
||||
let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b);
|
||||
assert_eq!(applied_to_b, vec![]);
|
||||
|
||||
for shard in a.iter_mut().chain(b.iter_mut()) {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_az_shard_scheduling() -> anyhow::Result<()> {
|
||||
use rand::seq::SliceRandom;
|
||||
|
||||
for seed in 0..50 {
|
||||
eprintln!("Running test with seed {seed}");
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
|
||||
let az_a_tag = AvailabilityZone("az-a".to_string());
|
||||
let az_b_tag = AvailabilityZone("az-b".to_string());
|
||||
let azs = [az_a_tag, az_b_tag];
|
||||
let nodes = make_test_nodes(4, &azs);
|
||||
let mut shards_per_az: HashMap<AvailabilityZone, u32> = HashMap::new();
|
||||
|
||||
let mut scheduler = Scheduler::new([].iter());
|
||||
for node in nodes.values() {
|
||||
scheduler.node_upsert(node);
|
||||
}
|
||||
|
||||
let mut shards = Vec::default();
|
||||
let mut contexts = Vec::default();
|
||||
let mut az_picker = azs.iter().cycle().cloned();
|
||||
for i in 0..100 {
|
||||
let az = az_picker.next().unwrap();
|
||||
let shard_count = i % 4 + 1;
|
||||
*shards_per_az.entry(az.clone()).or_default() += shard_count;
|
||||
|
||||
let tenant_shards = make_test_tenant(
|
||||
PlacementPolicy::Attached(1),
|
||||
ShardCount::new(shard_count.try_into().unwrap()),
|
||||
Some(az),
|
||||
);
|
||||
let context = Rc::new(RefCell::new(ScheduleContext::default()));
|
||||
|
||||
contexts.push(context.clone());
|
||||
let with_ctx = tenant_shards
|
||||
.into_iter()
|
||||
.map(|shard| (shard, context.clone()));
|
||||
for shard_with_ctx in with_ctx {
|
||||
shards.push(shard_with_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
shards.shuffle(&mut rng);
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct NodeStats {
|
||||
attachments: u32,
|
||||
secondaries: u32,
|
||||
}
|
||||
|
||||
let mut node_stats: HashMap<NodeId, NodeStats> = HashMap::default();
|
||||
let mut attachments_in_wrong_az = 0;
|
||||
let mut secondaries_in_wrong_az = 0;
|
||||
|
||||
for (shard, context) in &mut shards {
|
||||
let context = &mut *context.borrow_mut();
|
||||
shard.schedule(&mut scheduler, context).unwrap();
|
||||
|
||||
let attached_node = shard.intent.get_attached().unwrap();
|
||||
let stats = node_stats.entry(attached_node).or_default();
|
||||
stats.attachments += 1;
|
||||
|
||||
let secondary_node = *shard.intent.get_secondary().first().unwrap();
|
||||
let stats = node_stats.entry(secondary_node).or_default();
|
||||
stats.secondaries += 1;
|
||||
|
||||
let attached_node_az = nodes
|
||||
.get(&attached_node)
|
||||
.unwrap()
|
||||
.get_availability_zone_id();
|
||||
let secondary_node_az = nodes
|
||||
.get(&secondary_node)
|
||||
.unwrap()
|
||||
.get_availability_zone_id();
|
||||
let preferred_az = shard.preferred_az().unwrap();
|
||||
|
||||
if attached_node_az != preferred_az {
|
||||
eprintln!(
|
||||
"{} attachment was scheduled in AZ {} but preferred AZ {}",
|
||||
shard.tenant_shard_id, attached_node_az, preferred_az
|
||||
);
|
||||
attachments_in_wrong_az += 1;
|
||||
}
|
||||
|
||||
if secondary_node_az == preferred_az {
|
||||
eprintln!(
|
||||
"{} secondary was scheduled in AZ {} which matches preference",
|
||||
shard.tenant_shard_id, attached_node_az
|
||||
);
|
||||
secondaries_in_wrong_az += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let mut violations = Vec::default();
|
||||
|
||||
if attachments_in_wrong_az > 0 {
|
||||
violations.push(format!(
|
||||
"{} attachments scheduled to the incorrect AZ",
|
||||
attachments_in_wrong_az
|
||||
));
|
||||
}
|
||||
|
||||
if secondaries_in_wrong_az > 0 {
|
||||
violations.push(format!(
|
||||
"{} secondaries scheduled to the incorrect AZ",
|
||||
secondaries_in_wrong_az
|
||||
));
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"attachments_in_wrong_az={} secondaries_in_wrong_az={}",
|
||||
attachments_in_wrong_az, secondaries_in_wrong_az
|
||||
);
|
||||
|
||||
for (node_id, stats) in &node_stats {
|
||||
let node_az = nodes.get(node_id).unwrap().get_availability_zone_id();
|
||||
let ideal_attachment_load = shards_per_az.get(node_az).unwrap() / 2;
|
||||
let allowed_attachment_load =
|
||||
(ideal_attachment_load - 1)..(ideal_attachment_load + 2);
|
||||
|
||||
if !allowed_attachment_load.contains(&stats.attachments) {
|
||||
violations.push(format!(
|
||||
"Found {} attachments on node {}, but expected {}",
|
||||
stats.attachments, node_id, ideal_attachment_load
|
||||
));
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"{}: attachments={} secondaries={} ideal_attachment_load={}",
|
||||
node_id, stats.attachments, stats.secondaries, ideal_attachment_load
|
||||
);
|
||||
}
|
||||
|
||||
assert!(violations.is_empty(), "{violations:?}");
|
||||
|
||||
for (mut shard, _ctx) in shards {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ license.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
either.workspace = true
|
||||
anyhow.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use itertools::Itertools;
|
||||
use pageserver::tenant::checks::check_valid_layermap;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{info, warn};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -29,9 +28,8 @@ pub(crate) struct TimelineAnalysis {
|
||||
/// yet.
|
||||
pub(crate) warnings: Vec<String>,
|
||||
|
||||
/// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
|
||||
/// of races between reading the metadata and reading the objects.
|
||||
pub(crate) garbage_keys: Vec<String>,
|
||||
/// Objects whose keys were not recognized at all, i.e. not layer files, not indices, and not initdb archive.
|
||||
pub(crate) unknown_keys: Vec<String>,
|
||||
}
|
||||
|
||||
impl TimelineAnalysis {
|
||||
@@ -39,7 +37,7 @@ impl TimelineAnalysis {
|
||||
Self {
|
||||
errors: Vec::new(),
|
||||
warnings: Vec::new(),
|
||||
garbage_keys: Vec::new(),
|
||||
unknown_keys: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,7 +57,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
) -> TimelineAnalysis {
|
||||
let mut result = TimelineAnalysis::new();
|
||||
|
||||
info!("Checking timeline {id}");
|
||||
info!("Checking timeline");
|
||||
|
||||
if let Some(s3_active_branch) = s3_active_branch {
|
||||
info!(
|
||||
@@ -80,7 +78,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
match s3_data {
|
||||
Some(s3_data) => {
|
||||
result
|
||||
.garbage_keys
|
||||
.unknown_keys
|
||||
.extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
|
||||
|
||||
match s3_data.blob_data {
|
||||
@@ -204,10 +202,10 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
warn!("Timeline metadata warnings: {0:?}", result.warnings);
|
||||
}
|
||||
|
||||
if !result.garbage_keys.is_empty() {
|
||||
error!(
|
||||
"The following keys should be removed from S3: {0:?}",
|
||||
result.garbage_keys
|
||||
if !result.unknown_keys.is_empty() {
|
||||
warn!(
|
||||
"The following keys are not recognized: {0:?}",
|
||||
result.unknown_keys
|
||||
)
|
||||
}
|
||||
|
||||
@@ -294,10 +292,10 @@ impl TenantObjectListing {
|
||||
pub(crate) struct RemoteTimelineBlobData {
|
||||
pub(crate) blob_data: BlobDataParseResult,
|
||||
|
||||
// Index objects that were not used when loading `blob_data`, e.g. those from old generations
|
||||
/// Index objects that were not used when loading `blob_data`, e.g. those from old generations
|
||||
pub(crate) unused_index_keys: Vec<ListingObject>,
|
||||
|
||||
// Objects whose keys were not recognized at all, i.e. not layer files, not indices
|
||||
/// Objects whose keys were not recognized at all, i.e. not layer files, not indices
|
||||
pub(crate) unknown_keys: Vec<ListingObject>,
|
||||
}
|
||||
|
||||
@@ -329,11 +327,54 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
    }
}

/// Note (<https://github.com/neondatabase/neon/issues/8872>):
/// Since we do not guarantee the order of the listing, we could list layer keys right before
/// pageserver `RemoteTimelineClient` deletes the layer files and then the index.
/// In that rare case, this would give back a transient error where the index key is missing.
///
/// To avoid generating false positives, we try streaming the listing a second time.
pub(crate) async fn list_timeline_blobs(
    remote_client: &GenericRemoteStorage,
    id: TenantShardTimelineId,
    root_target: &RootTarget,
) -> anyhow::Result<RemoteTimelineBlobData> {
    let res = list_timeline_blobs_impl(remote_client, id, root_target).await?;
    match res {
        ListTimelineBlobsResult::Ready(data) => Ok(data),
        ListTimelineBlobsResult::MissingIndexPart(_) => {
            // Retry if the index is missing.
            let data = list_timeline_blobs_impl(remote_client, id, root_target)
                .await?
                .into_data();
            Ok(data)
        }
    }
}

enum ListTimelineBlobsResult {
    /// Blob data is ready to be interpreted.
    Ready(RemoteTimelineBlobData),
    /// The listed timeline blobs have layer files but are missing [`IndexPart`].
    MissingIndexPart(RemoteTimelineBlobData),
}

impl ListTimelineBlobsResult {
    /// Get the inner blob data regardless of the status.
    pub fn into_data(self) -> RemoteTimelineBlobData {
        match self {
            ListTimelineBlobsResult::Ready(data) => data,
            ListTimelineBlobsResult::MissingIndexPart(data) => data,
        }
    }
}

/// Returns [`ListTimelineBlobsResult::MissingIndexPart`] if blob data has layer files
/// but is missing [`IndexPart`], otherwise returns [`ListTimelineBlobsResult::Ready`].
async fn list_timeline_blobs_impl(
    remote_client: &GenericRemoteStorage,
    id: TenantShardTimelineId,
    root_target: &RootTarget,
) -> anyhow::Result<ListTimelineBlobsResult> {
    let mut s3_layers = HashSet::new();

    let mut errors = Vec::new();
@@ -375,30 +416,28 @@ pub(crate) async fn list_timeline_blobs(
                s3_layers.insert((new_layer, gen));
            }
            Err(e) => {
                tracing::info!("Error parsing key {maybe_layer_name}");
                errors.push(
                    format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                );
                tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}");
                unknown_keys.push(obj);
            }
        },
        None => {
            tracing::warn!("Unknown key {key}");
            errors.push(format!("S3 list response got an object with odd key {key}"));
            tracing::info!("S3 listed an unknown key: {key}");
            unknown_keys.push(obj);
        }
    }
}

if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
    tracing::debug!(
        "Timeline is empty apart from initdb archive: expected post-deletion state."
    );
    return Ok(RemoteTimelineBlobData {
if index_part_keys.is_empty() && s3_layers.is_empty() {
    tracing::debug!("Timeline is empty: expected post-deletion state.");
    if initdb_archive {
        tracing::info!("Timeline is post deletion but initdb archive is still present.");
    }

    return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
        blob_data: BlobDataParseResult::Relic,
        unused_index_keys: index_part_keys,
        unknown_keys: Vec::new(),
    });
        unknown_keys,
    }));
}

// Choose the index_part with the highest generation
@@ -424,19 +463,43 @@ pub(crate) async fn list_timeline_blobs(
    match index_part_object.as_ref() {
        Some(selected) => index_part_keys.retain(|k| k != selected),
        None => {
            errors.push("S3 list response got no index_part.json file".to_string());
            // It is possible that the branch gets deleted after we got some layer files listed
            // and we no longer have the index file in the listing.
            errors.push(
                "S3 list response got no index_part.json file but still has layer files"
                    .to_string(),
            );
            return Ok(ListTimelineBlobsResult::MissingIndexPart(
                RemoteTimelineBlobData {
                    blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
                    unused_index_keys: index_part_keys,
                    unknown_keys,
                },
            ));
        }
    }

    if let Some(index_part_object_key) = index_part_object.as_ref() {
        let index_part_bytes =
            download_object_with_retries(remote_client, &index_part_object_key.key)
                .await
                .context("index_part.json download")?;
            match download_object_with_retries(remote_client, &index_part_object_key.key).await {
                Ok(index_part_bytes) => index_part_bytes,
                Err(e) => {
                    // It is possible that the branch gets deleted in between listing the objects
                    // and downloading the index part file.
                    errors.push(format!("failed to download index_part.json: {e}"));
                    return Ok(ListTimelineBlobsResult::MissingIndexPart(
                        RemoteTimelineBlobData {
                            blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
                            unused_index_keys: index_part_keys,
                            unknown_keys,
                        },
                    ));
                }
            };

        match serde_json::from_slice(&index_part_bytes) {
            Ok(index_part) => {
                return Ok(RemoteTimelineBlobData {
                return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
                    blob_data: BlobDataParseResult::Parsed {
                        index_part: Box::new(index_part),
                        index_part_generation,
@@ -444,7 +507,7 @@ pub(crate) async fn list_timeline_blobs(
                    },
                    unused_index_keys: index_part_keys,
                    unknown_keys,
                })
                }))
            }
            Err(index_parse_error) => errors.push(format!(
                "index_part.json body parsing error: {index_parse_error}"
@@ -458,9 +521,9 @@ pub(crate) async fn list_timeline_blobs(
        );
    }

    Ok(RemoteTimelineBlobData {
    Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
        unused_index_keys: index_part_keys,
        unknown_keys,
    })
    }))
}
@@ -41,6 +41,10 @@ struct Cli {
    #[arg(long)]
    /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
    controller_jwt: Option<String>,

    /// If set to true, the scrubber exits with an error code on fatal errors.
    #[arg(long, default_value_t = false)]
    exit_code: bool,
}

#[derive(Subcommand, Debug)]
@@ -203,6 +207,7 @@ async fn main() -> anyhow::Result<()> {
                tenant_ids,
                json,
                post_to_storcon,
                cli.exit_code,
            )
            .await
        }
@@ -269,6 +274,7 @@ async fn main() -> anyhow::Result<()> {
                gc_min_age,
                gc_mode,
                post_to_storcon,
                cli.exit_code,
            )
            .await
        }
@@ -284,6 +290,7 @@ pub async fn run_cron_job(
    gc_min_age: humantime::Duration,
    gc_mode: GcMode,
    post_to_storcon: bool,
    exit_code: bool,
) -> anyhow::Result<()> {
    tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
    pageserver_physical_gc_cmd(
@@ -301,6 +308,7 @@ pub async fn run_cron_job(
        Vec::new(),
        true,
        post_to_storcon,
        exit_code,
    )
    .await?;
@@ -349,6 +357,7 @@ pub async fn scan_pageserver_metadata_cmd(
    tenant_shard_ids: Vec<TenantShardId>,
    json: bool,
    post_to_storcon: bool,
    exit_code: bool,
) -> anyhow::Result<()> {
    if controller_client.is_none() && post_to_storcon {
        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
@@ -380,6 +389,9 @@ pub async fn scan_pageserver_metadata_cmd(

    if summary.is_fatal() {
        tracing::error!("Fatal scrub errors detected");
        if exit_code {
            std::process::exit(1);
        }
    } else if summary.is_empty() {
        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
        // scrubber they were likely expecting to scan something, and if we see no timelines
@@ -391,6 +403,9 @@ pub async fn scan_pageserver_metadata_cmd(
                .prefix_in_bucket
                .unwrap_or("<none>".to_string())
        );
        if exit_code {
            std::process::exit(1);
        }
    }

    Ok(())
@@ -12,6 +12,7 @@ use pageserver_api::controller_api::MetadataHealthUpdateRequest;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use serde::Serialize;
use tracing::{info_span, Instrument};
use utils::id::TenantId;
use utils::shard::ShardCount;
@@ -169,45 +170,54 @@ pub async fn scan_pageserver_metadata(
    let mut timeline_ids = HashSet::new();
    let mut timeline_generations = HashMap::new();
    for (ttid, data) in timelines {
        if ttid.tenant_shard_id.shard_count == highest_shard_count {
            // Only analyze `TenantShardId`s with highest shard count.
        async {
            if ttid.tenant_shard_id.shard_count == highest_shard_count {
                // Only analyze `TenantShardId`s with highest shard count.

            // Stash the generation of each timeline, for later use identifying orphan layers
            if let BlobDataParseResult::Parsed {
                index_part,
                index_part_generation,
                s3_layers: _s3_layers,
            } = &data.blob_data
            {
                if index_part.deleted_at.is_some() {
                    // skip deleted timeline.
                    tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid);
                    continue;
                // Stash the generation of each timeline, for later use identifying orphan layers
                if let BlobDataParseResult::Parsed {
                    index_part,
                    index_part_generation,
                    s3_layers: _s3_layers,
                } = &data.blob_data
                {
                    if index_part.deleted_at.is_some() {
                        // skip deleted timeline.
                        tracing::info!(
                            "Skip analysis of {} b/c timeline is already deleted",
                            ttid
                        );
                        return;
                    }
                    timeline_generations.insert(ttid, *index_part_generation);
                }
            timeline_generations.insert(ttid, *index_part_generation);

            // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
            // reference counts for layers across the tenant.
            let analysis = branch_cleanup_and_check_errors(
                remote_client,
                &ttid,
                &mut tenant_objects,
                None,
                None,
                Some(data),
            )
            .await;
            summary.update_analysis(&ttid, &analysis);

            timeline_ids.insert(ttid.timeline_id);
        } else {
            tracing::info!(
                "Skip analysis of {} b/c a lower shard count than {}",
                ttid,
                highest_shard_count.0,
            );
        }

                // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
                // reference counts for layers across the tenant.
                let analysis = branch_cleanup_and_check_errors(
                    remote_client,
                    &ttid,
                    &mut tenant_objects,
                    None,
                    None,
                    Some(data),
                )
                .await;
                summary.update_analysis(&ttid, &analysis);

                timeline_ids.insert(ttid.timeline_id);
            } else {
                tracing::info!(
                    "Skip analysis of {} b/c a lower shard count than {}",
                    ttid,
                    highest_shard_count.0,
                );
            }
        .instrument(
            info_span!("analyze-timeline", shard = %ttid.tenant_shard_id.shard_slug(), timeline = %ttid.timeline_id),
        )
        .await
    }

    summary.timeline_count += timeline_ids.len();
@@ -278,6 +288,7 @@ pub async fn scan_pageserver_metadata(
                timelines,
                highest_shard_count,
            )
            .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
            .await;
            tenant_id = Some(ttid.tenant_shard_id.tenant_id);
            highest_shard_count = ttid.tenant_shard_id.shard_count;
@@ -306,15 +317,18 @@ pub async fn scan_pageserver_metadata(
        tenant_timeline_results.push((ttid, data));
    }

    let tenant_id = tenant_id.expect("Must be set if results are present");

    if !tenant_timeline_results.is_empty() {
        analyze_tenant(
            &remote_client,
            tenant_id.expect("Must be set if results are present"),
            tenant_id,
            &mut summary,
            tenant_objects,
            tenant_timeline_results,
            highest_shard_count,
        )
        .instrument(info_span!("analyze-tenant", tenant = %tenant_id))
        .await;
    }

test_runner/cloud_regress/test_cloud_regress.py (new file, 100 lines)
@@ -0,0 +1,100 @@
"""
Run the regression tests on the cloud instance of Neon
"""

from pathlib import Path
from typing import Any

import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.pg_version import PgVersion


@pytest.fixture
def setup(remote_pg: RemotePostgres):
    """
    Setup and teardown of the tests
    """
    with psycopg2.connect(remote_pg.connstr()) as conn:
        with conn.cursor() as cur:
            log.info("Creating the extension")
            cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
            conn.commit()
            # TODO: Migrate to branches and remove this code
            log.info("Looking for subscriptions in the regress database")
            cur.execute(
                "SELECT subname FROM pg_catalog.pg_subscription WHERE "
                "subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
            )
            if cur.rowcount > 0:
                with psycopg2.connect(
                    dbname="regression",
                    host=remote_pg.default_options["host"],
                    user=remote_pg.default_options["user"],
                    password=remote_pg.default_options["password"],
                ) as regress_conn:
                    with regress_conn.cursor() as regress_cur:
                        for sub in cur:
                            regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
                            regress_cur.execute(
                                f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
                            )
                            regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
                        regress_conn.commit()

    yield
    # TODO: Migrate to branches and remove this code
    log.info("Looking for extra roles...")
    with psycopg2.connect(remote_pg.connstr()) as conn:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
            )
            roles: list[Any] = []
            for role in cur:
                log.info("Role found: %s", role[0])
                roles.append(role[0])
            for role in roles:
                cur.execute(f"DROP ROLE {role}")
            conn.commit()


@pytest.mark.timeout(7200)
@pytest.mark.remote_cluster
def test_cloud_regress(
    setup,
    remote_pg: RemotePostgres,
    pg_version: PgVersion,
    pg_distrib_dir: Path,
    base_dir: Path,
    test_output_dir: Path,
):
    """
    Run the regression tests
    """
    regress_bin = (
        pg_distrib_dir / f"{pg_version.v_prefixed}/lib/postgresql/pgxs/src/test/regress/pg_regress"
    )
    test_path = base_dir / f"vendor/postgres-{pg_version.v_prefixed}/src/test/regress"

    env_vars = {
        "PGHOST": remote_pg.default_options["host"],
        "PGPORT": str(
            remote_pg.default_options["port"] if "port" in remote_pg.default_options else 5432
        ),
        "PGUSER": remote_pg.default_options["user"],
        "PGPASSWORD": remote_pg.default_options["password"],
        "PGDATABASE": remote_pg.default_options["dbname"],
    }
    regress_cmd = [
        str(regress_bin),
        f"--inputdir={test_path}",
        f"--bindir={pg_distrib_dir}/{pg_version.v_prefixed}/bin",
        "--dlpath=/usr/local/lib",
        "--max-concurrent-tests=20",
        f"--schedule={test_path}/parallel_schedule",
        "--max-connections=5",
    ]
    remote_pg.pg_bin.run(regress_cmd, env=env_vars, cwd=test_output_dir)
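pg_regress itself picks up its connection from the standard libpq environment variables set in env_vars above. As a quick way to sanity-check that handoff before the long regression run, here is a minimal sketch using psycopg2; this helper is not part of the diff, and the env_vars dict is assumed to be the one built in the test.

import os
import psycopg2


def check_connection(env_vars: dict[str, str]) -> None:
    # libpq-style handoff: PGHOST/PGPORT/PGUSER/PGPASSWORD/PGDATABASE are
    # honoured by psycopg2 when no explicit DSN parameters are given.
    os.environ.update(env_vars)
    with psycopg2.connect("") as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
            assert cur.fetchone() == (1,)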
@@ -102,6 +102,11 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]


def counter(name: str) -> str:
    # the prometheus_client package appends _total to all counters client-side
    return f"{name}_total"


PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
    "pageserver_remote_timeline_client_calls_started_total",
    "pageserver_remote_timeline_client_calls_finished_total",
@@ -132,9 +137,14 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    *histogram("pageserver_wait_lsn_seconds"),
    *histogram("pageserver_remote_operation_seconds"),
    *histogram("pageserver_io_operations_seconds"),
    "pageserver_smgr_query_started_global_count_total",
    "pageserver_tenant_states_count",
    "pageserver_circuit_breaker_broken_total",
    "pageserver_circuit_breaker_unbroken_total",
    counter("pageserver_tenant_throttling_count_accounted_start_global"),
    counter("pageserver_tenant_throttling_count_accounted_finish_global"),
    counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
    counter("pageserver_tenant_throttling_count_global"),
)

PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
@@ -146,6 +156,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
    "pageserver_smgr_query_seconds_sum",
    "pageserver_smgr_query_started_count_total",
    "pageserver_archive_size",
    "pageserver_pitr_history_size",
    "pageserver_layer_bytes",
@@ -157,6 +168,10 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_evictions_with_low_residence_duration_total",
    "pageserver_aux_file_estimated_size",
    "pageserver_valid_lsn_lease_count",
    counter("pageserver_tenant_throttling_count_accounted_start"),
    counter("pageserver_tenant_throttling_count_accounted_finish"),
    counter("pageserver_tenant_throttling_wait_usecs_sum"),
    counter("pageserver_tenant_throttling_count"),
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    # "pageserver_directory_entries_count", -- only used if above a certain threshold
    # "pageserver_broken_tenants_count" -- used only for broken
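As a quick illustration of how the helper functions in these hunks expand into the metric names the tests assert on, here is what they return when called standalone (the function bodies are taken verbatim from the hunk above; only the example inputs and printed output are added here):

def histogram(prefix_without_trailing_underscore: str) -> list[str]:
    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]


def counter(name: str) -> str:
    # the prometheus_client package appends _total to all counters client-side
    return f"{name}_total"


print(histogram("pageserver_io_operations_seconds"))
# ['pageserver_io_operations_seconds_bucket',
#  'pageserver_io_operations_seconds_count',
#  'pageserver_io_operations_seconds_sum']
print(counter("pageserver_tenant_throttling_count_global"))
# pageserver_tenant_throttling_count_global_total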
@@ -642,9 +642,6 @@ class NeonEnvBuilder:
            patch_script = ""
            for ps in self.env.pageservers:
                patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
                # This is a temporary hack to keep the backward compat test happy,
                # since the compat snapshot was generated with an older version of neon local
                patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;"
            patch_script_path.write_text(patch_script)

            # Update the config with info about tenants and timelines
@@ -849,7 +846,7 @@ class NeonEnvBuilder:

        for directory_to_clean in reversed(directories_to_clean):
            if not os.listdir(directory_to_clean):
                log.info(f"Removing empty directory {directory_to_clean}")
                log.debug(f"Removing empty directory {directory_to_clean}")
                try:
                    directory_to_clean.rmdir()
                except Exception as e:
@@ -2553,7 +2550,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        desired_availability: Optional[PageserverAvailability],
        desired_scheduling_policy: Optional[PageserverSchedulingPolicy],
        max_attempts: int,
        backoff: int,
        backoff: float,
    ):
        """
        Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability'
@@ -2948,7 +2945,7 @@ class NeonPageserver(PgProtocol, LogUtils):
            self.id
        ):
            self.env.storage_controller.poll_node_status(
                self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1
                self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1
            )

        return self
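The poll_node_status change keeps roughly the same overall wait budget (200 attempts at 0.1 s instead of 20 attempts at 1 s) while reacting to readiness much sooner. A minimal sketch of that kind of loop follows; it is an assumption-level stand-in, not the fixture's actual implementation, which queries the storage controller API between sleeps.

import time


def poll_until_ready(check, max_attempts: int, backoff: float) -> None:
    # Hypothetical stand-in for poll_node_status: `check` would query the
    # storage controller for the node's availability and scheduling policy.
    for _ in range(max_attempts):
        if check():
            return
        time.sleep(backoff)
    raise TimeoutError(f"node not ready after {max_attempts} attempts")


# max_attempts=200, backoff=0.1 waits up to ~20 s, like the old 20 x 1 s,
# but notices a ready node within ~0.1 s instead of up to 1 s.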
@@ -4617,7 +4614,8 @@ class StorageScrubber:
            "REGION": s3_storage.bucket_region,
            "BUCKET": s3_storage.bucket_name,
            "BUCKET_PREFIX": s3_storage.prefix_in_bucket,
            "RUST_LOG": "DEBUG",
            "RUST_LOG": "INFO",
            "PAGESERVER_DISABLE_FILE_LOGGING": "1",
        }
        env.update(s3_storage.access_env_vars())
@@ -4637,10 +4635,8 @@ class StorageScrubber:
        (output_path, stdout, status_code) = subprocess_capture(
            self.log_dir,
            args,
            echo_stderr=True,
            echo_stdout=True,
            env=env,
            check=False,
            check=True,
            capture_stdout=True,
            timeout=timeout,
        )
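The switch from check=False to check=True gives the call the same contract as subprocess.run(check=True): a non-zero scrubber exit now raises instead of being silently recorded. Sketched with the standard library below, under the assumption that the fixture's subprocess_capture wrapper forwards this flag with analogous behaviour:

import subprocess

try:
    subprocess.run(["false"], check=True)
except subprocess.CalledProcessError as e:
    # With check=True the failure surfaces as an exception, so a failing
    # scrubber run fails the test instead of going unnoticed.
    print(f"command failed with exit code {e.returncode}")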
@@ -631,7 +631,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        log.info(
            f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
        )
        res = self.post(
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
            json=config,
        )
@@ -236,7 +236,7 @@ def get_scale_for_db(size_mb: int) -> int:


ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
    r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
)
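The widened attachment pattern now also picks up regression.out alongside regression.diffs. A quick check against the new pattern as written (how the harness applies it, match versus search, is not shown in this hunk):

import re

ATTACHMENT_NAME_REGEX = re.compile(
    r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
)

assert ATTACHMENT_NAME_REGEX.fullmatch("regression.out")
assert ATTACHMENT_NAME_REGEX.fullmatch("regression.diffs")
assert ATTACHMENT_NAME_REGEX.fullmatch("pg_regress.log")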
test_runner/regress/test_compute_metrics.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from fixtures.neon_fixtures import NeonEnv


def test_compute_metrics(neon_simple_env: NeonEnv):
    """
    Test compute metrics, exposed in the neon_backend_perf_counters and
    neon_perf_counters views
    """
    env = neon_simple_env
    endpoint = env.endpoints.create_start("main")

    conn = endpoint.connect()
    cur = conn.cursor()

    # We don't check that the values make sense, this is just a very
    # basic check that the server doesn't crash or something like that.
    #
    # 1.5 is the minimum version to contain these views.
    cur.execute("CREATE EXTENSION neon VERSION '1.5'")
    cur.execute("SELECT * FROM neon_perf_counters")
    cur.execute("SELECT * FROM neon_backend_perf_counters")
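If one wanted to go a step beyond the smoke test above, the query results could also be inspected. A minimal sketch follows; the exact view columns are not shown in this diff, so only row counts are asserted, and the helper name is made up for illustration.

def check_counters_nonempty(cur) -> None:
    # Hypothetical extra assertion: both views should return at least one
    # row once the neon extension (>= 1.5) is installed.
    for view in ("neon_perf_counters", "neon_backend_perf_counters"):
        cur.execute(f"SELECT count(*) FROM {view}")
        assert cur.fetchone()[0] > 0, f"{view} returned no rows"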
@@ -198,9 +198,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):

def run_pgbench(connstr: str, pg_bin: PgBin):
    log.info(f"Start a pgbench workload on pg {connstr}")
    # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr])
    log.info("pgbench init done")
    pg_bin.run_capture(["pgbench", "-T60", connstr])
@@ -247,9 +244,15 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
        log.info(
            f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
        )

        # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", primary.connstr()])
        log.info("pgbench init done in primary")

        t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
        t.start()
        # Wait until pgbench_accounts is created + filled on replica *and*

        # Wait until we see that the pgbench_accounts is created + filled on replica *and*
        # index is created. Otherwise index creation would conflict with
        # read queries and hs feedback won't save us.
        wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
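For context on that wait: wait_until retries a callable until it stops raising, here up to 60 attempts at 1-second intervals. A minimal sketch of the kind of readiness probe being waited on is shown below; the real pgbench_accounts_initialized helper lives in the test module and may differ, and the table/index checks here are assumptions.

def pgbench_accounts_initialized(endpoint) -> None:
    # Assumed shape of the readiness probe: raise (via assert) until the
    # replica has pgbench_accounts populated and an index built on it.
    rows = endpoint.safe_psql("SELECT count(*) FROM pgbench_accounts")[0][0]
    assert rows > 0, "pgbench_accounts not filled yet"
    idx = endpoint.safe_psql(
        "SELECT count(*) FROM pg_indexes WHERE tablename = 'pgbench_accounts'"
    )[0][0]
    assert idx > 0, "pgbench_accounts index not created yet"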