mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 22:20:37 +00:00
Compare commits
12 Commits
release-75
...
bodobolero
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b774e1655d | ||
|
|
4ab4510a84 | ||
|
|
6c0cf1c1cc | ||
|
|
95d50f918a | ||
|
|
30ba4ac50b | ||
|
|
2aef4a93e0 | ||
|
|
6638abda65 | ||
|
|
306017ebc7 | ||
|
|
8467fe9b05 | ||
|
|
08c2021881 | ||
|
|
4975dfd3cf | ||
|
|
5b26876ac7 |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -25,4 +25,3 @@ config-variables:
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- SLACK_ON_CALL_QA_STAGING_STREAM
|
||||
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
|
||||
- SLACK_ON_CALL_STORAGE_STAGING_STREAM
|
||||
|
||||
48
.github/actions/neon-project-create/action.yml
vendored
48
.github/actions/neon-project-create/action.yml
vendored
@@ -17,6 +17,31 @@ inputs:
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units'
|
||||
default: '[1, 1]'
|
||||
# settings below only needed if you want the project to be sharded from the beginning
|
||||
shard_split_project:
|
||||
description: 'by default new projects are not shard-split, specify true to shard-split'
|
||||
required: false
|
||||
default: 'false'
|
||||
admin_api_key:
|
||||
description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true'
|
||||
required: false
|
||||
shard_count:
|
||||
description: 'Number of shards to split the project into, only applies if shard_split_project is true'
|
||||
required: false
|
||||
default: '8'
|
||||
stripe_size:
|
||||
description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 256 MiB (32768 pages), only applies if shard_split_project is true'
|
||||
required: false
|
||||
default: '32768'
|
||||
psql_path:
|
||||
description: 'Path to psql binary - the caller is responsible for provisioning the psql binary'
|
||||
required: false
|
||||
default: '/tmp/neon/pg_install/v16/bin/psql'
|
||||
libpq_lib_path:
|
||||
description: 'Path to directory containing libpq library - the caller is responsible for provisioning the libpq library'
|
||||
required: false
|
||||
default: '/tmp/neon/pg_install/v16/lib'
|
||||
|
||||
|
||||
outputs:
|
||||
dsn:
|
||||
@@ -63,6 +88,23 @@ runs:
|
||||
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
|
||||
|
||||
echo "Project ${project_id} has been created"
|
||||
|
||||
if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
|
||||
# determine tenant ID
|
||||
TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
|
||||
|
||||
echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"
|
||||
|
||||
echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
|
||||
echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
|
||||
|
||||
# we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
|
||||
curl -X PUT \
|
||||
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
|
||||
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||
-d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}"
|
||||
fi
|
||||
|
||||
env:
|
||||
API_HOST: ${{ inputs.api_host }}
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
@@ -70,3 +112,9 @@ runs:
|
||||
POSTGRES_VERSION: ${{ inputs.postgres_version }}
|
||||
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
|
||||
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
|
||||
SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }}
|
||||
ADMIN_API_KEY: ${{ inputs.admin_api_key }}
|
||||
SHARD_COUNT: ${{ inputs.shard_count }}
|
||||
STRIPE_SIZE: ${{ inputs.stripe_size }}
|
||||
PSQL: ${{ inputs.psql_path }}
|
||||
LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
|
||||
|
||||
91
.github/workflows/_check-codestyle-rust.yml
vendored
91
.github/workflows/_check-codestyle-rust.yml
vendored
@@ -1,91 +0,0 @@
|
||||
name: Check Codestyle Rust
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
build-tools-image:
|
||||
description: "build-tools image"
|
||||
required: true
|
||||
type: string
|
||||
archs:
|
||||
description: "Json array of architectures to run on"
|
||||
type: string
|
||||
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
strategy:
|
||||
matrix:
|
||||
arch: ${{ fromJson(inputs.archs) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
#
|
||||
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
|
||||
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
|
||||
# time just for that, so skip "clippy --release".
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
- name: Run cargo clippy (debug)
|
||||
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
101
.github/workflows/build_and_test.yml
vendored
101
.github/workflows/build_and_test.yml
vendored
@@ -164,11 +164,77 @@ jobs:
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
uses: ./.github/workflows/_check-codestyle-rust.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
archs: '["x64", "arm64"]'
|
||||
secrets: inherit
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
#
|
||||
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
|
||||
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
|
||||
# time just for that, so skip "clippy --release".
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
- name: Run cargo clippy (debug)
|
||||
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ tag, build-build-tools-image ]
|
||||
@@ -280,22 +346,25 @@ jobs:
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
report-benchmarks-results-to-slack:
|
||||
report-benchmarks-failures:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result)
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: slackapi/slack-github-action@v2
|
||||
- uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}"
|
||||
text: |
|
||||
Benchmarks on main: *${{ needs.benchmarks.result }}*
|
||||
- <${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
- <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
||||
slack-message: |
|
||||
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
|
||||
30
.github/workflows/ingest_benchmark.yml
vendored
30
.github/workflows/ingest_benchmark.yml
vendored
@@ -28,7 +28,31 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false # allow other variants to continue even if one fails
|
||||
matrix:
|
||||
target_project: [new_empty_project, large_existing_project]
|
||||
include:
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 128 # 1 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 256 # 2 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 512 # 4 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 1024 # 8 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 2048 # 16 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 4096 # 32 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 8192 # 64 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 16384 # 128 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold
|
||||
# while here it is sharded from the beginning with a shard size of 256 MiB
|
||||
- target_project: new_empty_project
|
||||
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
|
||||
- target_project: large_existing_project
|
||||
stripe_size: null # cannot re-shard or choose a different stripe size for an existing, already sharded project
|
||||
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
@@ -75,6 +99,10 @@ jobs:
|
||||
postgres_version: 16
|
||||
compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }}
|
||||
admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}
|
||||
shard_count: 8
|
||||
stripe_size: ${{ matrix.stripe_size }}
|
||||
|
||||
- name: Initialize Neon project
|
||||
if: ${{ matrix.target_project == 'new_empty_project' }}
|
||||
|
||||
33
.github/workflows/pre-merge-checks.yml
vendored
33
.github/workflows/pre-merge-checks.yml
vendored
@@ -1,12 +1,6 @@
|
||||
name: Pre-merge checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/_check-codestyle-python.yml
|
||||
- .github/workflows/_check-codestyle-rust.yml
|
||||
- .github/workflows/build-build-tools-image.yml
|
||||
- .github/workflows/pre-merge-checks.yml
|
||||
merge_group:
|
||||
branches:
|
||||
- main
|
||||
@@ -23,10 +17,8 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
python-changed: ${{ steps.python-src.outputs.any_changed }}
|
||||
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
|
||||
id: python-src
|
||||
with:
|
||||
@@ -38,25 +30,11 @@ jobs:
|
||||
poetry.lock
|
||||
pyproject.toml
|
||||
|
||||
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
|
||||
id: rust-src
|
||||
with:
|
||||
files: |
|
||||
.github/workflows/_check-codestyle-rust.yml
|
||||
.github/workflows/build-build-tools-image.yml
|
||||
.github/workflows/pre-merge-checks.yml
|
||||
**/**.rs
|
||||
**/Cargo.toml
|
||||
Cargo.toml
|
||||
Cargo.lock
|
||||
|
||||
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
|
||||
env:
|
||||
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
|
||||
RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
|
||||
run: |
|
||||
echo "${PYTHON_CHANGED_FILES}"
|
||||
echo "${RUST_CHANGED_FILES}"
|
||||
|
||||
build-build-tools-image:
|
||||
if: needs.get-changed-files.outputs.python-changed == 'true'
|
||||
@@ -77,16 +55,6 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-rust:
|
||||
if: needs.get-changed-files.outputs.rust-changed == 'true'
|
||||
needs: [ get-changed-files, build-build-tools-image ]
|
||||
uses: ./.github/workflows/_check-codestyle-rust.yml
|
||||
with:
|
||||
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
|
||||
archs: '["x64"]'
|
||||
secrets: inherit
|
||||
|
||||
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
|
||||
# Currently we require 2 jobs (checks with exact name):
|
||||
# - conclusion
|
||||
@@ -99,7 +67,6 @@ jobs:
|
||||
needs:
|
||||
- get-changed-files
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Create fake `neon-cloud-e2e` check
|
||||
|
||||
19
Cargo.lock
generated
19
Cargo.lock
generated
@@ -1312,7 +1312,6 @@ dependencies = [
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"vm_monitor",
|
||||
"workspace_hack",
|
||||
"zstd",
|
||||
@@ -3982,11 +3981,9 @@ name = "pagectl"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
"camino",
|
||||
"clap",
|
||||
"humantime",
|
||||
"itertools 0.10.5",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
@@ -4008,7 +4005,6 @@ dependencies = [
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"bincode",
|
||||
"bit_field",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -4048,7 +4044,6 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"postgres_initdb",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
@@ -5659,7 +5654,6 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"desim",
|
||||
"env_logger 0.10.2",
|
||||
"fail",
|
||||
"futures",
|
||||
"hex",
|
||||
@@ -5688,7 +5682,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
@@ -5713,13 +5706,10 @@ dependencies = [
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"const_format",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"utils",
|
||||
]
|
||||
@@ -7567,21 +7557,12 @@ dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"criterion",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"prost",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
|
||||
@@ -115,7 +115,7 @@ RUN set -e \
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.17.0
|
||||
ENV SQL_EXPORTER_VERSION=0.16.0
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
|
||||
@@ -66,7 +66,6 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
@@ -105,18 +104,16 @@ RUN cd postgres && \
|
||||
esac; \
|
||||
done;
|
||||
|
||||
# Set PATH for all the subsequent build steps
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS postgis-build
|
||||
FROM build-deps AS postgis-build
|
||||
ARG DEBIAN_VERSION
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y \
|
||||
gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
|
||||
@@ -154,6 +151,8 @@ RUN case "${DEBIAN_VERSION}" in \
|
||||
DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
ninja clean && cp -R /sfcgal/* /
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
# Postgis 3.5.0 supports v17
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
@@ -228,8 +227,9 @@ RUN case "${PG_VERSION}" in \
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS plv8-build
|
||||
FROM build-deps AS plv8-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
|
||||
|
||||
@@ -264,6 +264,7 @@ RUN case "${PG_VERSION}" in \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
@@ -290,8 +291,9 @@ RUN case "${PG_VERSION}" in \
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS h3-pg-build
|
||||
FROM build-deps AS h3-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v4.1.0 - Jan 18, 2023
|
||||
@@ -312,6 +314,7 @@ RUN mkdir -p /h3/usr/ && \
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
|
||||
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
@@ -323,16 +326,17 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
|
||||
# compile unit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS unit-pg-build
|
||||
FROM build-deps AS unit-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release 7.9 - Sep 15, 2024
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
|
||||
echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is built. So we need to adjust the path.
|
||||
# This one-liner removes pgsql/ part of the path.
|
||||
@@ -346,8 +350,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS vector-pg-build
|
||||
FROM build-deps AS vector-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/pgvector.patch /pgvector.patch
|
||||
|
||||
@@ -361,8 +366,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
|
||||
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -371,15 +376,16 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pgjwt-pg-build
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -388,16 +394,17 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS hypopg-pg-build
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# HypoPG 1.4.1 supports v17
|
||||
# last release 1.4.1 - Apr 28, 2024
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
|
||||
echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -406,16 +413,17 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
|
||||
# compile pg_hashids extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-hashids-pg-build
|
||||
FROM build-deps AS pg-hashids-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.2.1 - Jan 12, 2018
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -424,8 +432,9 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rum-pg-build
|
||||
FROM build-deps AS rum-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/rum.patch /rum.patch
|
||||
|
||||
@@ -436,8 +445,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
|
||||
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /rum.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -446,16 +455,17 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pgtap-pg-build
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# pgtap 1.3.3 supports v17
|
||||
# last release v1.3.3 - Apr 8, 2024
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
|
||||
echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -464,16 +474,17 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
|
||||
# compile ip4r extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS ip4r-pg-build
|
||||
FROM build-deps AS ip4r-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v2.4.2 - Jul 29, 2023
|
||||
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
|
||||
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -482,16 +493,17 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS prefix-pg-build
|
||||
FROM build-deps AS prefix-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.2.10 - Jul 5, 2023
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
|
||||
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
|
||||
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -500,16 +512,17 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS hll-pg-build
|
||||
FROM build-deps AS hll-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v2.18 - Aug 29, 2023
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
|
||||
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
|
||||
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -518,16 +531,17 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS plpgsql-check-pg-build
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# plpgsql_check v2.7.11 supports v17
|
||||
# last release v2.7.11 - Sep 16, 2024
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -536,8 +550,11 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
|
||||
# compile timescaledb extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS timescaledb-pg-build
|
||||
FROM build-deps AS timescaledb-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
@@ -568,8 +585,11 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_hint_plan extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-hint-plan-pg-build
|
||||
FROM build-deps AS pg-hint-plan-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
# version-specific, has separate releases for each version
|
||||
RUN case "${PG_VERSION}" in \
|
||||
@@ -607,12 +627,14 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_cron extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-cron-pg-build
|
||||
FROM build-deps AS pg-cron-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is an experimental extension that we do not support on prod yet.
|
||||
# !Do not remove!
|
||||
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
@@ -626,8 +648,9 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
|
||||
# compile rdkit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rdkit-pg-build
|
||||
FROM build-deps AS rdkit-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y \
|
||||
@@ -645,13 +668,7 @@ RUN apt update && \
|
||||
# Use new version only for v17
|
||||
# because Release_2024_09_1 has some backward incompatible changes
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
|
||||
# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
|
||||
# pg_config. For some reason the rdkit cmake script doesn't work with just that,
|
||||
# however. By also adding /usr/local/pgsql, it works, which is weird because there
|
||||
# are no executables in that directory.
|
||||
ENV PATH="/usr/local/pgsql:$PATH"
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
export RDKIT_VERSION=Release_2024_09_1 \
|
||||
@@ -704,11 +721,13 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-uuidv7-pg-build
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.6.0 - Oct 9, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
@@ -722,11 +741,13 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
|
||||
# compile pg_roaringbitmap extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-roaringbitmap-pg-build
|
||||
FROM build-deps AS pg-roaringbitmap-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v0.5.4 - Jun 28, 2022
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
@@ -740,14 +761,16 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-semver-pg-build
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Release 0.40.0 breaks backward compatibility with previous versions
|
||||
# see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
|
||||
# Use new version only for v17
|
||||
#
|
||||
# last release v0.40.0 - Jul 22, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
export SEMVER_VERSION=0.40.0 \
|
||||
@@ -774,11 +797,13 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_embedding extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-embedding-pg-build
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is our extension, support stopped in favor of pgvector
|
||||
# TODO: deprecate it
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PG_EMBEDDING_VERSION=0.3.5 \
|
||||
@@ -799,18 +824,20 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-anon-pg-build
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is an experimental extension, never got to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -819,8 +846,9 @@ RUN case "${PG_VERSION}" in "v17") \
|
||||
# This layer is used to build `pgrx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rust-extensions-build
|
||||
FROM build-deps AS rust-extensions-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
@@ -828,7 +856,7 @@ RUN apt update && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -855,8 +883,9 @@ USER root
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rust-extensions-build-pgrx12
|
||||
FROM build-deps AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
@@ -864,7 +893,7 @@ RUN apt update && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -872,7 +901,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.12.9 cargo-pgrx && \
|
||||
cargo install --locked --version 0.12.6 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
@@ -909,19 +938,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
|
||||
\
|
||||
cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
|
||||
\
|
||||
cd ../rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
|
||||
\
|
||||
cd ../rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
@@ -946,8 +975,7 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
|
||||
# against postgres forks that decided to change their ABI name (like us).
|
||||
# With that we can build extensions without forking them and using stock
|
||||
# pgx. As this feature is new, a few manual version bumps were required.
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
@@ -965,8 +993,7 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# setting superuser = true is needed to enable the extension because it uses the untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
@@ -987,58 +1014,33 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
|
||||
# TODO update pgrx version in the pg_tiktoken repo and remove this line
|
||||
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-pgx-ulid-build"
|
||||
# Compile "pgx_ulid" extension for v16 and below
|
||||
# Compile "pgx_ulid" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-pgx-ulid-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16") \
|
||||
;; \
|
||||
*) \
|
||||
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
|
||||
;; \
|
||||
# doesn't support v17 yet
|
||||
# https://github.com/pksunkara/pgx_ulid/pull/52
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-pgx-ulid-pgrx12-build"
|
||||
# Compile "pgx_ulid" extension for v17 and up
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
;; \
|
||||
*) \
|
||||
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1056,11 +1058,7 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release
|
||||
|
||||
#########################################################################################
|
||||
@@ -1070,11 +1068,13 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM pg-build AS wal2json-pg-build
|
||||
FROM build-deps AS wal2json-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# wal2json wal2json_2_6 supports v17
|
||||
# last release wal2json_2_6 - Apr 25, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
|
||||
echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1087,11 +1087,13 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
|
||||
# compile pg_ivm extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-ivm-build
|
||||
FROM build-deps AS pg-ivm-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# pg_ivm v1.9 supports v17
|
||||
# last release v1.9 - Jul 31
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1105,11 +1107,13 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
|
||||
# compile pg_partman extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-partman-build
|
||||
FROM build-deps AS pg-partman-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# should support v17 https://github.com/pgpartman/pg_partman/discussions/693
|
||||
# last release 5.1.0 Apr 2, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1125,6 +1129,9 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
|
||||
#########################################################################################
|
||||
FROM rust-extensions-build AS pg-mooncake-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
|
||||
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
|
||||
@@ -1140,8 +1147,11 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM pg-build AS pg-repack-build
|
||||
FROM build-deps AS pg-repack-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
|
||||
echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
|
||||
@@ -1183,7 +1193,6 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1265,11 +1274,11 @@ RUN set -e \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
|
||||
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -51,7 +51,6 @@ tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
prometheus.workspace = true
|
||||
|
||||
postgres_initdb.workspace = true
|
||||
|
||||
@@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Parser;
|
||||
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
|
||||
use nix::unistd::Pid;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use tracing::{info, info_span, warn, Instrument};
|
||||
use utils::fs_ext::is_directory_empty;
|
||||
|
||||
#[path = "fast_import/aws_s3_sync.rs"]
|
||||
@@ -41,19 +41,12 @@ mod child_stdio_to_log;
|
||||
#[path = "fast_import/s3_uri.rs"]
|
||||
mod s3_uri;
|
||||
|
||||
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
|
||||
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
#[clap(long)]
|
||||
working_directory: Utf8PathBuf,
|
||||
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
|
||||
s3_prefix: Option<s3_uri::S3Uri>,
|
||||
#[clap(long)]
|
||||
source_connection_string: Option<String>,
|
||||
#[clap(short, long)]
|
||||
interactive: bool,
|
||||
s3_prefix: s3_uri::S3Uri,
|
||||
#[clap(long)]
|
||||
pg_bin_dir: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
@@ -84,70 +77,30 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
|
||||
info!("starting");
|
||||
|
||||
let args = Args::parse();
|
||||
let Args {
|
||||
working_directory,
|
||||
s3_prefix,
|
||||
pg_bin_dir,
|
||||
pg_lib_dir,
|
||||
} = Args::parse();
|
||||
|
||||
// Validate arguments
|
||||
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
|
||||
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
|
||||
}
|
||||
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
|
||||
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
|
||||
}
|
||||
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
|
||||
let working_directory = args.working_directory;
|
||||
let pg_bin_dir = args.pg_bin_dir;
|
||||
let pg_lib_dir = args.pg_lib_dir;
|
||||
|
||||
// Initialize AWS clients only if s3_prefix is specified
|
||||
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
|
||||
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
let kms = aws_sdk_kms::Client::new(&config);
|
||||
(Some(config), Some(kms))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
// Get source connection string either from S3 spec or direct argument
|
||||
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
|
||||
let spec: Spec = {
|
||||
let spec_key = s3_prefix.append("/spec.json");
|
||||
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
|
||||
let object = s3_client
|
||||
.get_object()
|
||||
.bucket(&spec_key.bucket)
|
||||
.key(spec_key.key)
|
||||
.send()
|
||||
.await
|
||||
.context("get spec from s3")?
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.context("download spec body")?;
|
||||
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
|
||||
};
|
||||
|
||||
match spec.encryption_secret {
|
||||
EncryptionSecret::KMS { key_id } => {
|
||||
let mut output = kms_client
|
||||
.unwrap()
|
||||
.decrypt()
|
||||
.key_id(key_id)
|
||||
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
|
||||
spec.source_connstring_ciphertext_base64,
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.context("decrypt source connection string")?;
|
||||
let plaintext = output
|
||||
.plaintext
|
||||
.take()
|
||||
.context("get plaintext source connection string")?;
|
||||
String::from_utf8(plaintext.into_inner())
|
||||
.context("parse source connection string as utf8")?
|
||||
}
|
||||
}
|
||||
} else {
|
||||
args.source_connection_string.unwrap()
|
||||
let spec: Spec = {
|
||||
let spec_key = s3_prefix.append("/spec.json");
|
||||
let s3_client = aws_sdk_s3::Client::new(&aws_config);
|
||||
let object = s3_client
|
||||
.get_object()
|
||||
.bucket(&spec_key.bucket)
|
||||
.key(spec_key.key)
|
||||
.send()
|
||||
.await
|
||||
.context("get spec from s3")?
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.context("download spec body")?;
|
||||
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
|
||||
};
|
||||
|
||||
match tokio::fs::create_dir(&working_directory).await {
|
||||
@@ -170,6 +123,15 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.await
|
||||
.context("create pgdata directory")?;
|
||||
|
||||
//
|
||||
// Setup clients
|
||||
//
|
||||
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
let kms_client = aws_sdk_kms::Client::new(&aws_config);
|
||||
|
||||
//
|
||||
// Initialize pgdata
|
||||
//
|
||||
let pgbin = pg_bin_dir.join("postgres");
|
||||
let pg_version = match get_pg_version(pgbin.as_ref()) {
|
||||
PostgresMajorVersion::V14 => 14,
|
||||
@@ -208,13 +170,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.args(["-c", &format!("max_parallel_workers={nproc}")])
|
||||
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
|
||||
.args(["-c", &format!("max_worker_processes={nproc}")])
|
||||
.args([
|
||||
"-c",
|
||||
&format!(
|
||||
"effective_io_concurrency={}",
|
||||
if cfg!(target_os = "macos") { 0 } else { 100 }
|
||||
),
|
||||
])
|
||||
.args(["-c", "effective_io_concurrency=100"])
|
||||
.env_clear()
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
@@ -229,58 +185,44 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.instrument(info_span!("postgres")),
|
||||
);
|
||||
|
||||
// Create neondb database in the running postgres
|
||||
let restore_pg_connstring =
|
||||
format!("host=localhost port=5432 user={superuser} dbname=postgres");
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
if start_time.elapsed() > PG_WAIT_TIMEOUT {
|
||||
error!(
|
||||
"timeout exceeded: failed to poll postgres and create database within 10 minutes"
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
|
||||
Ok((client, connection)) => {
|
||||
// Spawn the connection handling task to maintain the connection
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
warn!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
match client.simple_query("CREATE DATABASE neondb;").await {
|
||||
Ok(_) => {
|
||||
info!("created neondb database");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"failed to create database: {}, retying in {}s",
|
||||
e,
|
||||
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
|
||||
);
|
||||
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
info!(
|
||||
"postgres not ready yet, retrying in {}s",
|
||||
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
|
||||
);
|
||||
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
|
||||
continue;
|
||||
}
|
||||
let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
|
||||
if res.is_ok() {
|
||||
info!("postgres is ready, could connect to it");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
|
||||
//
|
||||
// Decrypt connection string
|
||||
//
|
||||
let source_connection_string = {
|
||||
match spec.encryption_secret {
|
||||
EncryptionSecret::KMS { key_id } => {
|
||||
let mut output = kms_client
|
||||
.decrypt()
|
||||
.key_id(key_id)
|
||||
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
|
||||
spec.source_connstring_ciphertext_base64,
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.context("decrypt source connection string")?;
|
||||
let plaintext = output
|
||||
.plaintext
|
||||
.take()
|
||||
.context("get plaintext source connection string")?;
|
||||
String::from_utf8(plaintext.into_inner())
|
||||
.context("parse source connection string as utf8")?
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// Start the work
|
||||
//
|
||||
|
||||
let dumpdir = working_directory.join("dumpdir");
|
||||
|
||||
@@ -368,12 +310,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// If interactive mode, wait for Ctrl+C
|
||||
if args.interactive {
|
||||
info!("Running in interactive mode. Press Ctrl+C to shut down.");
|
||||
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
|
||||
}
|
||||
|
||||
info!("shutdown postgres");
|
||||
{
|
||||
nix::sys::signal::kill(
|
||||
@@ -389,24 +325,21 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.context("wait for postgres to shut down")?;
|
||||
}
|
||||
|
||||
// Only sync if s3_prefix was specified
|
||||
if let Some(s3_prefix) = args.s3_prefix {
|
||||
info!("upload pgdata");
|
||||
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
|
||||
.await
|
||||
.context("sync dump directory to destination")?;
|
||||
info!("upload pgdata");
|
||||
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
|
||||
.await
|
||||
.context("sync dump directory to destination")?;
|
||||
|
||||
info!("write status");
|
||||
{
|
||||
let status_dir = working_directory.join("status");
|
||||
std::fs::create_dir(&status_dir).context("create status directory")?;
|
||||
let status_file = status_dir.join("pgdata");
|
||||
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
|
||||
.context("write status file")?;
|
||||
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
|
||||
.await
|
||||
.context("sync status directory to destination")?;
|
||||
}
|
||||
info!("write status");
|
||||
{
|
||||
let status_dir = working_directory.join("status");
|
||||
std::fs::create_dir(&status_dir).context("create status directory")?;
|
||||
let status_file = status_dir.join("pgdata");
|
||||
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
|
||||
.context("write status file")?;
|
||||
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
|
||||
.await
|
||||
.context("sync status directory to destination")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -17,8 +17,7 @@ use crate::{
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub(in crate::http) struct ExtensionServerParams {
|
||||
#[serde(default)]
|
||||
is_library: bool,
|
||||
is_library: Option<bool>,
|
||||
}
|
||||
|
||||
/// Download a remote extension.
|
||||
@@ -52,7 +51,7 @@ pub(in crate::http) async fn download_extension(
|
||||
|
||||
remote_extensions.get_ext(
|
||||
&filename,
|
||||
params.is_library,
|
||||
params.is_library.unwrap_or(false),
|
||||
&compute.build_tag,
|
||||
&compute.pgversion,
|
||||
)
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
use std::{
|
||||
net::{IpAddr, Ipv6Addr, SocketAddr},
|
||||
sync::Arc,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use axum::{
|
||||
extract::Request,
|
||||
middleware::{self, Next},
|
||||
response::{IntoResponse, Response},
|
||||
routing::{get, post},
|
||||
Router,
|
||||
@@ -16,9 +17,11 @@ use axum::{
|
||||
use http::StatusCode;
|
||||
use tokio::net::TcpListener;
|
||||
use tower::ServiceBuilder;
|
||||
use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
|
||||
use tower_http::{
|
||||
request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
|
||||
trace::TraceLayer,
|
||||
};
|
||||
use tracing::{debug, error, info, Span};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
@@ -31,24 +34,30 @@ async fn handle_404() -> Response {
|
||||
StatusCode::NOT_FOUND.into_response()
|
||||
}
|
||||
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
#[derive(Clone, Default)]
|
||||
struct ComputeMakeRequestId(Arc<AtomicU64>);
|
||||
|
||||
/// This middleware function allows compute_ctl to generate its own request ID
|
||||
/// if one isn't supplied. The control plane will always send one as a UUID. The
|
||||
/// neon Postgres extension on the other hand does not send one.
|
||||
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
|
||||
let headers = request.headers_mut();
|
||||
impl MakeRequestId for ComputeMakeRequestId {
|
||||
fn make_request_id<B>(
|
||||
&mut self,
|
||||
_request: &http::Request<B>,
|
||||
) -> Option<tower_http::request_id::RequestId> {
|
||||
let request_id = self
|
||||
.0
|
||||
.fetch_add(1, Ordering::SeqCst)
|
||||
.to_string()
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
if headers.get(X_REQUEST_ID).is_none() {
|
||||
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
|
||||
Some(RequestId::new(request_id))
|
||||
}
|
||||
|
||||
next.run(request).await
|
||||
}
|
||||
|
||||
/// Run the HTTP server and wait on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
let mut app = Router::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
@@ -73,8 +82,9 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
.fallback(handle_404)
|
||||
.layer(
|
||||
ServiceBuilder::new()
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
.layer(SetRequestIdLayer::x_request_id(
|
||||
ComputeMakeRequestId::default(),
|
||||
))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
|
||||
@@ -1,17 +1,12 @@
|
||||
use futures::StreamExt;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
|
||||
ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
|
||||
TenantPolicyRequest,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -158,12 +153,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
TenantSetPreferredAz {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
preferred_az: Option<String>,
|
||||
},
|
||||
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
|
||||
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
|
||||
TenantDrop {
|
||||
@@ -232,13 +221,6 @@ enum Command {
|
||||
},
|
||||
/// List safekeepers known to the storage controller
|
||||
Safekeepers {},
|
||||
/// Set the scheduling policy of the specified safekeeper
|
||||
SafekeeperScheduling {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
scheduling_policy: SkSchedulingPolicyArg,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -291,17 +273,6 @@ impl FromStr for PlacementPolicyArg {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct SkSchedulingPolicyArg(SkSchedulingPolicy);
|
||||
|
||||
impl FromStr for SkSchedulingPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
SkSchedulingPolicy::from_str(s).map(Self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
|
||||
|
||||
@@ -431,12 +402,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
node.availability_zone_id,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
@@ -496,65 +466,33 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{table}");
|
||||
}
|
||||
Command::Tenants { node_id: None } => {
|
||||
// Set up output formatting
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
"Preferred AZ",
|
||||
"ShardCount",
|
||||
"StripeSize",
|
||||
"Placement",
|
||||
"Scheduling",
|
||||
]);
|
||||
|
||||
// Pagination loop over listing API
|
||||
let mut start_after = None;
|
||||
const LIMIT: usize = 1000;
|
||||
loop {
|
||||
let path = match start_after {
|
||||
None => format!("control/v1/tenant?limit={LIMIT}"),
|
||||
Some(start_after) => {
|
||||
format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}")
|
||||
}
|
||||
};
|
||||
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(Method::GET, path, None)
|
||||
.await?;
|
||||
|
||||
if resp.is_empty() {
|
||||
// End of data reached
|
||||
break;
|
||||
}
|
||||
|
||||
// Give some visual feedback while we're building up the table (comfy_table doesn't have
|
||||
// streaming output)
|
||||
if resp.len() >= LIMIT {
|
||||
eprint!(".");
|
||||
}
|
||||
|
||||
start_after = Some(resp.last().unwrap().tenant_id);
|
||||
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
shard_zero
|
||||
.preferred_az_id
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.unwrap_or("".to_string()),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
// Terminate progress dots
|
||||
if table.row_count() > LIMIT {
|
||||
eprint!("");
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
}
|
||||
|
||||
println!("{table}");
|
||||
@@ -676,19 +614,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let nodes = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let nodes = nodes
|
||||
.into_iter()
|
||||
.map(|n| (n.id, n))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
println!("Tenant {tenant_id}");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.add_row(["Policy", &format!("{:?}", policy)]);
|
||||
@@ -697,14 +622,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{table}");
|
||||
println!("Shards:");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"Shard",
|
||||
"Attached",
|
||||
"Attached AZ",
|
||||
"Secondary",
|
||||
"Last error",
|
||||
"status",
|
||||
]);
|
||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||
for shard in shards {
|
||||
let secondary = shard
|
||||
.node_secondary
|
||||
@@ -727,18 +645,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
let status = status_parts.join(",");
|
||||
|
||||
let attached_node = shard
|
||||
.node_attached
|
||||
.as_ref()
|
||||
.map(|id| nodes.get(id).expect("Shard references nonexistent node"));
|
||||
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
attached_node
|
||||
.map(|n| format!("{} ({})", n.listen_http_addr, n.id))
|
||||
.unwrap_or(String::new()),
|
||||
attached_node
|
||||
.map(|n| n.availability_zone_id.clone())
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
.unwrap_or(String::new()),
|
||||
secondary,
|
||||
shard.last_error,
|
||||
@@ -747,66 +658,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantSetPreferredAz {
|
||||
tenant_id,
|
||||
preferred_az,
|
||||
} => {
|
||||
// First learn about the tenant's shards
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Learn about nodes to validate the AZ ID
|
||||
let nodes = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(preferred_az) = &preferred_az {
|
||||
let azs = nodes
|
||||
.into_iter()
|
||||
.map(|n| (n.availability_zone_id))
|
||||
.collect::<HashSet<_>>();
|
||||
if !azs.contains(preferred_az) {
|
||||
anyhow::bail!(
|
||||
"AZ {} not found on any node: known AZs are: {:?}",
|
||||
preferred_az,
|
||||
azs
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Make it obvious to the user that since they've omitted an AZ, we're clearing it
|
||||
eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
|
||||
}
|
||||
|
||||
// Construct a request that modifies all the tenant's shards
|
||||
let req = ShardsPreferredAzsRequest {
|
||||
preferred_az_ids: describe_response
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|s| {
|
||||
(
|
||||
s.tenant_shard_id,
|
||||
preferred_az.clone().map(AvailabilityZone),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<ShardsPreferredAzsRequest, ()>(
|
||||
Method::PUT,
|
||||
"control/v1/preferred_azs".to_string(),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantWarmup { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
@@ -1221,23 +1072,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::SafekeeperScheduling {
|
||||
node_id,
|
||||
scheduling_policy,
|
||||
} => {
|
||||
let scheduling_policy = scheduling_policy.0;
|
||||
storcon_client
|
||||
.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
|
||||
Method::POST,
|
||||
format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
|
||||
Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Scheduling policy of {node_id} set to {}",
|
||||
String::from(scheduling_policy)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it
|
||||
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
|
||||
response it sends its current configuration generation to let walproposer know.
|
||||
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
|
||||
current one and ignores it otherwise. In any case it replies with
|
||||
```
|
||||
@@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not
|
||||
define consensus members. Instead, on start walproposer tracks highest
|
||||
configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
|
||||
from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
|
||||
establishes this configuration as its own and moves to voting.
|
||||
establishes this configuration as its own and moves to voting.
|
||||
|
||||
It should stop talking to safekeepers not listed in the configuration at this
|
||||
point, though it is not unsafe to continue doing so.
|
||||
@@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts.
|
||||
The following algorithm can be executed anywhere having access to configuration
|
||||
storage and safekeepers. It is safe to interrupt / restart it and run multiple
|
||||
instances of it concurrently, though likely one of them won't make
|
||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
||||
|
||||
Algorithm will refuse to make the change if it encounters previous interrupted
|
||||
change attempt, but in this case it will try to finish it.
|
||||
@@ -140,7 +140,7 @@ storage are reachable.
|
||||
safe. Failed CAS aborts the procedure.
|
||||
4) Call `PUT` `configuration` on safekeepers from the current set,
|
||||
delivering them `joint_conf`. Collecting responses from majority is required
|
||||
to proceed. If any response returned generation higher than
|
||||
to proceed. If any response returned generation higher than
|
||||
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
|
||||
max `<last_log_term, flush_lsn>` among responses and establish it as
|
||||
(in memory) `sync_position`. Also choose max `term` and establish it as (in
|
||||
@@ -149,49 +149,49 @@ storage are reachable.
|
||||
without ack from the new set. Similarly, we'll bump term on new majority
|
||||
to `sync_term` so that two computes with the same term are never elected.
|
||||
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
|
||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
||||
current set. Doing that on majority of `new_sk_set` is enough to
|
||||
proceed, but it is reasonable to ensure that all `new_sk_set` members
|
||||
are initialized -- if some of them are down why are we migrating there?
|
||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
||||
Success on majority is enough.
|
||||
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
|
||||
delivering them `joint_conf` and collecting their positions. This will
|
||||
switch them to the `joint_conf` which generally won't be needed
|
||||
switch them to the `joint_conf` which generally won't be needed
|
||||
because `pull_timeline` already includes it and, additionally, it would be
|
||||
broadcast by compute. More importantly, we may proceed to the next step
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
||||
`pull_timeline` already includes it. However, we should double
|
||||
check to be safe. For example, timeline could have been created earlier e.g.
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
storage under one more CAS.
|
||||
8) Call `PUT` `configuration` on safekeepers from the new set,
|
||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
||||
of the new set; the rest can be updated by compute.
|
||||
|
||||
I haven't put huge effort to make the description above very precise, because it
|
||||
is natural language prone to interpretations anyway. Instead I'd like to make TLA+
|
||||
spec of it.
|
||||
|
||||
Description above focuses on safety. To make the flow practical and live, here are a few more
|
||||
Description above focuses on safety. To make the flow practical and live, here are a few more
|
||||
considerations.
|
||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
||||
step 3.
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
it is safe to rollback to the old conf with one more CAS.
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
the simplest is the procedure restart. There are more complicated scenarios like mentioned
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
has a disadvantage: you might imagine a surpassingly unlikely schedule where condition in
|
||||
the step 5 is never reached until compute is (re)awakened to synchronize new member(s).
|
||||
I don't think we'll observe this in practice, but can add waking up compute if needed.
|
||||
4) In the end timeline should be locally deleted on the safekeeper(s) which are
|
||||
in the old set but not in the new one, unless they are unreachable. To be
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
current configuration is <= the one in the request and the safekeeper is not a member of it).
|
||||
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
|
||||
jump to step 7, using it as `new_conf`.
|
||||
@@ -202,87 +202,47 @@ The procedure ought to be driven from somewhere. Obvious candidates are control
|
||||
plane and storage_controller; and as each of them already has db we don't want
|
||||
yet another storage. I propose to manage safekeepers in storage_controller
|
||||
because 1) since it is in rust it simplifies simulation testing (more on this
|
||||
below) 2) it already manages pageservers.
|
||||
below) 2) it already manages pageservers.
|
||||
|
||||
This assumes that migration will be fully usable only after we migrate all
|
||||
tenants/timelines to storage_controller. It is debatable whether we want also
|
||||
to manage pageserver attachments for all of these, but likely we do.
|
||||
|
||||
This requires us to define storcon <-> cplane interface and changes.
|
||||
This requires us to define storcon <-> cplane interface.
|
||||
|
||||
### storage_controller <-> control plane interface and changes
|
||||
### storage_controller <-> control plane interface
|
||||
|
||||
First of all, control plane should
|
||||
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
|
||||
storing safekeepers per timeline instead of per tenant because we can't migrate
|
||||
tenants atomically.
|
||||
tenants atomically.
|
||||
|
||||
The important question is how updated configuration is delivered from
|
||||
storage_controller to control plane to provide it to computes. As always, there
|
||||
are two options, pull and push. Let's use push, the same as with pageserver
|
||||
`/notify-attach` because 1) it keeps storage_controller out of critical compute
|
||||
start path 2) uniformity. It makes storage_controller responsible for retrying
|
||||
notifying control plane until it succeeds.
|
||||
start path 2) provides easier upgrade: there won't be such a thing as 'timeline
|
||||
managed by control plane / storcon', cplane just takes the value out of its db
|
||||
when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
|
||||
control plane until it succeeds.
|
||||
|
||||
It is not needed for the control plane to fully know the `Configuration`. It is
|
||||
enough for it only to be aware of the list of safekeepers in the latest
|
||||
configuration to supply it to compute, plus associated generation number to
|
||||
protect from stale update requests and to also pass it to compute.
|
||||
|
||||
So, cplane `/notify-safekeepers` for the timeline can accept JSON like
|
||||
```
|
||||
{
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
generation: u32,
|
||||
safekeepers: Vec<SafekeeperId>,
|
||||
}
|
||||
```
|
||||
where `SafekeeperId` is
|
||||
```
|
||||
{
|
||||
node_id: u64,
|
||||
host: String
|
||||
}
|
||||
```
|
||||
In principle `host` is redundant, but may be useful for observability.
|
||||
|
||||
The request updates list of safekeepers in the db if the provided conf
|
||||
generation is higher (the cplane db should also store generations for this).
|
||||
Similarly to
|
||||
[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365),
|
||||
it should update db which makes the call successful, and then try to schedule
|
||||
`apply_config` if possible, it is ok if not. storage_controller should rate
|
||||
limit calling the endpoint, but likely this won't be needed, as migration
|
||||
So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
|
||||
updates it in the db if the provided conf generation is higher (the cplane db
|
||||
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
|
||||
should update db which makes the call successful, and then try to schedule
|
||||
`apply_config` if possible, it is ok if not. storage_controller
|
||||
should rate limit calling the endpoint, but likely this won't be needed, as migration
|
||||
throughput is limited by `pull_timeline`.
|
||||
|
||||
Timeline (branch) creation in cplane should call storage_controller POST
|
||||
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
|
||||
Response should be augmented with `safekeepers_generation` and `safekeepers`
|
||||
fields like described in `/notify-safekeepers` above. Initially (currently)
|
||||
these fields may be absent; in this case cplane chooses safekeepers on its own
|
||||
like it currently does. The call should be retried until it succeeds.
|
||||
Response should be augmented with `safekeeper_conf: Configuration`. The call
|
||||
should be retried until it succeeds.
|
||||
|
||||
Timeline deletion and tenant deletion in cplane should call appropriate
|
||||
storage_controller endpoints like it currently does for sharded tenants. The
|
||||
calls should be retried until they succeed.
|
||||
|
||||
When compute receives safekeepers list from control plane it needs to know the
|
||||
generation to check whether it should be updated (note that compute may get
|
||||
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
|
||||
GUC is just a comma-separated list of `host:port`. Let's prefix it with
|
||||
`g#<generation>:` to this end, so it will look like
|
||||
```
|
||||
g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
|
||||
```
|
||||
|
||||
To summarize, list of cplane changes:
|
||||
- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field.
|
||||
- `/notify-safekeepers` endpoint.
|
||||
- Branch creation call may return list of safekeepers and when it is
|
||||
present cplane should adopt it instead of choosing on its own like it does currently.
|
||||
- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
|
||||
|
||||
### storage_controller implementation
|
||||
|
||||
Current 'load everything on startup and keep in memory' easy design is fine.
|
||||
@@ -400,10 +360,10 @@ source safekeeper might fail, which is not a problem if we are going to
|
||||
decommission the node but leaves garbage otherwise. I'd propose in the first version
|
||||
1) Don't attempt deletion at all if node status is `offline`.
|
||||
2) If it failed, just issue warning.
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
be deleted under generation number if node is not member of current generation.
|
||||
|
||||
Automating this is nontrivial; we'd need to register all potential missing
|
||||
@@ -452,8 +412,8 @@ There should be following layers of tests:
|
||||
3) Since simulation testing injects at relatively high level points (not
|
||||
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
|
||||
better to have basic tests covering whole system as well. Extended version of
|
||||
`test_restarts_under_load` would do: start background load and do migration
|
||||
under it, then restart endpoint and check that no reported commits
|
||||
`test_restarts_under_load` would do: start background load and do migration
|
||||
under it, then restart endpoint and check that no reported commits
|
||||
had been lost. I'd also add one more test creating a classic network split scenario, with
|
||||
one compute talking to AC and another to BD while migration from nodes ABC to ABD
|
||||
happens.
|
||||
@@ -462,51 +422,35 @@ There should be following layers of tests:
|
||||
|
||||
## Order of implementation and rollout
|
||||
|
||||
Note that
|
||||
Note that
|
||||
- Control plane parts and integration with it is fully independent from everything else
|
||||
(tests would use simulation and neon_local).
|
||||
- It is reasonable to make compute <-> safekeepers protocol change
|
||||
independent of enabling generations.
|
||||
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
|
||||
and its impl/rollout should be separate from migration itself.
|
||||
- Initially walproposer can just stop working while it observes joint configuration.
|
||||
- Initially walproposer can just stop working while it observes joint configuration.
|
||||
Such window would be typically very short anyway.
|
||||
- Obviously we want to test the whole thing thoroughly on staging and only then
|
||||
gradually enable in prod.
|
||||
|
||||
Let's have the following implementation bits for gradual rollout:
|
||||
- compute gets `neon.safekeepers_proto_version` flag.
|
||||
Initially both compute and safekeepers will be able to talk both
|
||||
versions so that we can delay force restart of them and for
|
||||
simplicity of rollback in case it is needed.
|
||||
- storcon gets `--set-safekeepers` config option disabled by
|
||||
default. Timeline creation request chooses safekeepers
|
||||
(and returns them in response to cplane) only when it is set to
|
||||
true.
|
||||
- control_plane [see above](#storage_controller---control-plane-interface-and-changes)
|
||||
prefixes `neon.safekeepers` GUC with generation number. When it is 0
|
||||
(or prefix not present at all), walproposer behaves as currently, committing on
|
||||
the provided safekeeper list -- generations are disabled.
|
||||
If it is nonzero, it follows the rules of this RFC.
|
||||
- We provide a script for manual migration to storage controller.
|
||||
It selects timeline(s) from control plane (specified or all of them) db
|
||||
and calls special import endpoint on storage controller which is very
|
||||
similar to timeline creation: it inserts into the db, sets
|
||||
configuration to initial on the safekeepers, calls cplane
|
||||
`notify-safekeepers`.
|
||||
To rollout smoothly, both walproposer and safekeeper should have flag
|
||||
`configurations_enabled`; when set to false, they would work as currently, i.e.
|
||||
walproposer is able to commit on whatever safekeeper set it is provided. Until
|
||||
all timelines are managed by storcon we'd need to use current script to migrate
|
||||
and update/drop entries in the storage_controller database if it has any.
|
||||
|
||||
Then the rollout for a region would be:
|
||||
- Current situation: safekeepers are chosen by control_plane.
|
||||
- We manually migrate some timelines, test moving them around.
|
||||
- Then we enable `--set-safekeepers` so that all new timelines
|
||||
are on storage controller.
|
||||
- Finally migrate all existing timelines using the script (no
|
||||
compute should be speaking old proto version at this point).
|
||||
Safekeepers would need to be able to talk both current and new protocol version
|
||||
with compute to reduce number of computes restarted in prod once v2 protocol is
|
||||
deployed (though before completely switching we'd need to force this).
|
||||
|
||||
Let's have the following rollout order:
|
||||
- storage_controller becomes aware of safekeepers;
|
||||
- storage_controller gets timeline creation for new timelines and deletion requests, but
|
||||
doesn't manage all timelines yet. Migration can be tested on these new timelines.
|
||||
To keep control plane and storage_controller databases in sync while control
|
||||
plane still chooses the safekeepers initially (until all timelines are imported
|
||||
it can choose better), `TimelineCreateRequest` can get optional safekeepers
|
||||
field with safekeepers chosen by cplane.
|
||||
- Then we can import all existing timelines from control plane to
|
||||
storage_controller and gradually enable configurations region by region.
|
||||
|
||||
Until all timelines are managed by storcon we'd need to use current ad hoc
|
||||
script to migrate if needed. To keep state clean, all storage controller managed
|
||||
timelines must be migrated before that, or controller db and configurations
|
||||
state of safekeepers dropped manually.
|
||||
|
||||
Very rough implementation order:
|
||||
- Add concept of configurations to safekeepers (including control file),
|
||||
@@ -514,10 +458,10 @@ Very rough implementation order:
|
||||
- Implement walproposer changes, including protocol.
|
||||
- Implement storcon part. Use it in neon_local (and pytest).
|
||||
- Make cplane store safekeepers per timeline instead of per tenant.
|
||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
||||
through storcon. Then we can test migration of new branches.
|
||||
- Finally import existing branches. Then we can drop cplane
|
||||
safekeeper selection code. Gradually enable configurations at
|
||||
- Finally import existing branches. Then we can drop cplane
|
||||
safekeeper selection code. Gradually enable configurations at
|
||||
computes and safekeepers. Before that, all computes must talk only
|
||||
v3 protocol version.
|
||||
|
||||
|
||||
@@ -1,247 +0,0 @@
|
||||
# CPU and Memory Profiling
|
||||
|
||||
Created 2025-01-12 by Erik Grinaker.
|
||||
|
||||
See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4).
|
||||
|
||||
## Summary
|
||||
|
||||
This document proposes a standard cross-team pattern for CPU and memory profiling across
|
||||
applications and languages, using the [pprof](https://github.com/google/pprof) profile format.
|
||||
|
||||
It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via
|
||||
[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/).
|
||||
Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations.
|
||||
|
||||
## Motivation
|
||||
|
||||
CPU and memory profiles are crucial observability tools for understanding performance issues,
|
||||
resource exhaustion, and resource costs. They allow answering questions like:
|
||||
|
||||
* Why is this process using 100% CPU?
|
||||
* How do I make this go faster?
|
||||
* Why did this process run out of memory?
|
||||
* Why are we paying for all these CPU cores and memory chips?
|
||||
|
||||
Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its
|
||||
standard library, using the [pprof](https://github.com/google/pprof) profile format and associated
|
||||
tooling.
|
||||
|
||||
This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires
|
||||
installing and running additional tools like `perf` as root on production nodes, with analysis tools
|
||||
that can be hard to use and often don't give good results. This is not only annoying, but can also
|
||||
significantly affect the resolution time of production incidents.
|
||||
|
||||
This proposal will:
|
||||
|
||||
* Provide CPU and heap profiles in pprof format via HTTP API.
|
||||
* Record continuous profiles in Grafana for aggregate historical analysis.
|
||||
* Make it easy for anyone to see a flamegraph in less than one minute.
|
||||
* Be reasonably consistent across teams and services (Rust, Go, C).
|
||||
|
||||
## Non Goals (For Now)
|
||||
|
||||
* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/)
|
||||
like mutexes, locks, goroutines, etc.
|
||||
* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/).
|
||||
* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization).
|
||||
|
||||
## Using Profiles
|
||||
|
||||
Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services:
|
||||
|
||||
```
|
||||
$ curl localhost:9898/profile/cpu >profile.pb.gz
|
||||
```
|
||||
|
||||
pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which
|
||||
provides flamegraphs, call graphs, plain text listings, and more:
|
||||
|
||||
```
|
||||
$ pprof -http :6060 <profile>
|
||||
```
|
||||
|
||||
Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly:
|
||||
|
||||
```
|
||||
$ curl localhost:9898/profile/cpu?format=svg >profile.svg
|
||||
$ open profile.svg
|
||||
```
|
||||
|
||||
Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles
|
||||
(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)).
|
||||
|
||||
## API Requirements
|
||||
|
||||
* HTTP endpoints that return a profile in pprof format (with symbols).
|
||||
* CPU: records a profile over the request time interval (`seconds` query parameter).
|
||||
* Memory: returns the current in-use heap allocations.
|
||||
* Unauthenticated, as it should not expose user data or pose a denial-of-service risk.
|
||||
* Default sample frequency should not impact service (maximum 5% CPU overhead).
|
||||
* Linux-compatibility.
|
||||
|
||||
Nice to have:
|
||||
|
||||
* Return flamegraph SVG directly from the HTTP endpoint if requested.
|
||||
* Configurable sample frequency for CPU profiles.
|
||||
* Historical heap allocations, by count and bytes.
|
||||
* macOS-compatibility.
|
||||
|
||||
## Rust Profiling
|
||||
|
||||
[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs)
|
||||
contains ready-to-use HTTP endpoints for CPU and memory profiling:
|
||||
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
|
||||
|
||||
### CPU
|
||||
|
||||
CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via
|
||||
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338).
|
||||
Expose it unauthenticated at `/profile/cpu`.
|
||||
|
||||
Parameters:
|
||||
|
||||
* `format`: profile output format (`pprof` or `svg`; default `pprof`).
|
||||
* `seconds`: duration to collect profile over, in seconds (default `5`).
|
||||
* `frequency`: how often to sample thread stacks, in Hz (default `99`).
|
||||
* `force`: if `true`, cancel a running profile and start a new one (default `false`).
|
||||
|
||||
Works on Linux and macOS.
|
||||
|
||||
### Memory
|
||||
|
||||
Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator),
|
||||
and enable profiling with samples every 2 MB allocated:
|
||||
|
||||
```rust
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
```
|
||||
|
||||
pprof profiles are generated by
|
||||
[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via
|
||||
[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
|
||||
Expose it unauthenticated at `/profile/heap`.
|
||||
|
||||
Parameters:
|
||||
|
||||
* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`).
|
||||
|
||||
Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26).
|
||||
|
||||
## Go Profiling
|
||||
|
||||
The Go standard library includes pprof profiling via HTTP API in
|
||||
[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at
|
||||
`/debug/pprof`.
|
||||
|
||||
Works on Linux and macOS.
|
||||
|
||||
### CPU
|
||||
|
||||
Via `/debug/pprof/profile`. Parameters:
|
||||
|
||||
* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`).
|
||||
* `seconds`: duration to collect profile over, in seconds (default `30`).
|
||||
|
||||
Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)),
|
||||
and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default
|
||||
is likely ok (estimated 1% overhead).
|
||||
|
||||
### Memory
|
||||
|
||||
Via `/debug/pprof/heap`. Parameters:
|
||||
|
||||
* `seconds`: take a delta profile over the given duration, in seconds (default `0`).
|
||||
* `gc`: if `1`, garbage collect before taking profile.
|
||||
|
||||
## C Profiling
|
||||
|
||||
[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling
|
||||
with pprof output.
|
||||
|
||||
However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value
|
||||
since we don't own the internals anyway.
|
||||
|
||||
Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient,
|
||||
so this is not a priority at the moment.
|
||||
|
||||
## Grafana Continuous Profiling
|
||||
|
||||
[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles
|
||||
across the fleet, and archives them as time series. This can be used to analyze resource usage over
|
||||
time, either in aggregate or zoomed in to specific events and nodes.
|
||||
|
||||
Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals
|
||||
is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB).
|
||||
|
||||
It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)
|
||||
for Pageserver and Safekeeper.
|
||||
|
||||
### Scraping
|
||||
|
||||
* CPU profiling: 59 seconds at 19 Hz every 60 seconds.
|
||||
* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds.
|
||||
|
||||
There are two main approaches that can be taken for CPU profiles:
|
||||
|
||||
* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds).
|
||||
* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds).
|
||||
|
||||
We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead
|
||||
of a spiky high overhead. It likely also gives a more representative view of resource usage.
|
||||
However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
|
||||
actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
|
||||
use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
|
||||
|
||||
Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
|
||||
To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
|
||||
cancel a running profile and start a new one.
|
||||
|
||||
### Overhead
|
||||
|
||||
With Rust:
|
||||
|
||||
* CPU profiles at 19 Hz frequency: 0.1% overhead.
|
||||
* Heap profiles at 2 MB frequency: 3% allocation overhead.
|
||||
* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
|
||||
* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
|
||||
|
||||
Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
|
||||
11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
|
||||
frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
|
||||
overhead).
|
||||
|
||||
CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
|
||||
after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
|
||||
of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
|
||||
trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
|
||||
likely 0.1% in practice (given e.g. context switches).
|
||||
|
||||
Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
|
||||
allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
|
||||
so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
|
||||
consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
|
||||
fact that performance-sensitive code will avoid allocations as far as possible.
|
||||
|
||||
Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
|
||||
Pageserver.
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
* eBPF profiles.
|
||||
* Don't require instrumenting the binary.
|
||||
* Use less resources.
|
||||
* Can profile in kernel space too.
|
||||
* Supported by Grafana.
|
||||
* Less information about stack frames and spans.
|
||||
* Limited tooling for local analysis.
|
||||
* Does not support heap profiles.
|
||||
* Does not work on macOS.
|
||||
|
||||
* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
|
||||
* We already use Grafana for everything else. Appears good enough.
|
||||
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
|
||||
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -144,8 +144,6 @@ pub struct NodeDescribeResponse {
|
||||
pub availability: NodeAvailabilityWrapper,
|
||||
pub scheduling: NodeSchedulingPolicy,
|
||||
|
||||
pub availability_zone_id: String,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
@@ -324,7 +322,7 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum SkSchedulingPolicy {
|
||||
Active,
|
||||
Pause,
|
||||
Disabled,
|
||||
Decomissioned,
|
||||
}
|
||||
|
||||
@@ -334,13 +332,9 @@ impl FromStr for SkSchedulingPolicy {
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
Ok(match s {
|
||||
"active" => Self::Active,
|
||||
"pause" => Self::Pause,
|
||||
"disabled" => Self::Disabled,
|
||||
"decomissioned" => Self::Decomissioned,
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Unknown scheduling policy '{s}', try active,pause,decomissioned"
|
||||
))
|
||||
}
|
||||
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -350,7 +344,7 @@ impl From<SkSchedulingPolicy> for String {
|
||||
use SkSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Pause => "pause",
|
||||
Disabled => "disabled",
|
||||
Decomissioned => "decomissioned",
|
||||
}
|
||||
.to_string()
|
||||
@@ -373,16 +367,6 @@ pub enum PlacementPolicy {
|
||||
Detached,
|
||||
}
|
||||
|
||||
impl PlacementPolicy {
|
||||
pub fn want_secondaries(&self) -> usize {
|
||||
match self {
|
||||
PlacementPolicy::Attached(secondary_count) => *secondary_count,
|
||||
PlacementPolicy::Secondary => 1,
|
||||
PlacementPolicy::Detached => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
@@ -420,6 +404,8 @@ pub struct MetadataHealthListOutdatedResponse {
|
||||
}
|
||||
|
||||
/// Publicly exposed safekeeper description
|
||||
///
|
||||
/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperDescribeResponse {
|
||||
pub id: NodeId,
|
||||
@@ -435,11 +421,6 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
@@ -24,9 +24,7 @@ pub struct Key {
|
||||
|
||||
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
|
||||
/// a struct of fields.
|
||||
#[derive(
|
||||
Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
|
||||
)]
|
||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
|
||||
pub struct CompactKey(i128);
|
||||
|
||||
/// The storage key size.
|
||||
|
||||
@@ -29,10 +29,11 @@ use utils::{
|
||||
};
|
||||
|
||||
use crate::{
|
||||
key::{CompactKey, Key},
|
||||
key::Key,
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
@@ -1399,8 +1400,6 @@ pub enum PagestreamFeMessage {
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -1412,22 +1411,6 @@ pub enum PagestreamBeMessage {
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamFeMessageTag {
|
||||
Exists = 0,
|
||||
Nblocks = 1,
|
||||
GetPage = 2,
|
||||
DbSize = 3,
|
||||
GetSlruSegment = 4,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 99,
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
@@ -1439,28 +1422,7 @@ enum PagestreamBeMessageTag {
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 199,
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamFeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
0 => Ok(PagestreamFeMessageTag::Exists),
|
||||
1 => Ok(PagestreamFeMessageTag::Nblocks),
|
||||
2 => Ok(PagestreamFeMessageTag::GetPage),
|
||||
3 => Ok(PagestreamFeMessageTag::DbSize),
|
||||
4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
99 => Ok(PagestreamFeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
@@ -1471,8 +1433,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
199 => Ok(PagestreamBeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
@@ -1590,20 +1550,6 @@ pub struct PagestreamDbSizeResponse {
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
/// Test-only pagestream request; compiled only when the `testing` feature is enabled.
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct PagestreamTestRequest {
|
||||
// Common request header (reqid, request_lsn, not_modified_since).
pub hdr: PagestreamRequest,
|
||||
// NOTE(review): presumably used to decide which test requests may be batched together -- confirm against the pageserver handler.
pub batch_key: u64,
|
||||
// Free-form payload; serialized as a u64 byte length followed by the UTF-8 bytes.
pub message: String,
|
||||
}
|
||||
|
||||
/// Test-only pagestream response; compiled only when the `testing` feature is enabled.
///
/// It carries the originating request, whose fields are what get serialized
/// for the `Test` response tag.
#[cfg(feature = "testing")]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamTestResponse {
|
||||
// The request this response corresponds to.
pub req: PagestreamTestRequest,
|
||||
}
|
||||
|
||||
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
|
||||
// that require pageserver-internal types. It is sufficient to get the total size.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -1623,7 +1569,7 @@ impl PagestreamFeMessage {
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
|
||||
bytes.put_u8(0);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1634,7 +1580,7 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
|
||||
bytes.put_u8(1);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1645,7 +1591,7 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
|
||||
bytes.put_u8(2);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1657,7 +1603,7 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
|
||||
bytes.put_u8(3);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1665,24 +1611,13 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
|
||||
bytes.put_u8(4);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Test as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.batch_key);
|
||||
let message = req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -1710,66 +1645,56 @@ impl PagestreamFeMessage {
|
||||
),
|
||||
};
|
||||
|
||||
match PagestreamFeMessageTag::try_from(msg_tag)
|
||||
.map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
|
||||
{
|
||||
PagestreamFeMessageTag::Exists => {
|
||||
Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::Nblocks => {
|
||||
Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetPage => {
|
||||
Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::DbSize => {
|
||||
Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
@@ -1780,21 +1705,7 @@ impl PagestreamFeMessage {
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key: body.read_u64::<BigEndian>()?,
|
||||
message: {
|
||||
let len = body.read_u64::<BigEndian>()?;
|
||||
let mut buf = vec![0; len as usize];
|
||||
body.read_exact(&mut buf)?;
|
||||
String::from_utf8(buf)?
|
||||
},
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1837,15 +1748,6 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
PagestreamProtocolVersion::V3 => {
|
||||
@@ -1914,18 +1816,6 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2068,28 +1958,6 @@ impl PagestreamBeMessage {
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Tag::Test => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let batch_key = buf.read_u64::<BigEndian>()?;
|
||||
let len = buf.read_u64::<BigEndian>()?;
|
||||
let mut msg = vec![0; len as usize];
|
||||
buf.read_exact(&mut msg)?;
|
||||
let message = String::from_utf8(msg)?;
|
||||
Self::Test(PagestreamTestResponse {
|
||||
req: PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key,
|
||||
message,
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
@@ -2109,25 +1977,6 @@ impl PagestreamBeMessage {
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(_) => "Test",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A single page trace record: a key, an LSN and a timestamp.
///
/// Serializable and deserializable through serde.
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct PageTraceEvent {
|
||||
// Compact form of the key the event refers to.
pub key: CompactKey,
|
||||
// NOTE(review): presumably the LSN the request was actually served at -- confirm with the producer of these events.
pub effective_lsn: Lsn,
|
||||
// Wall-clock time associated with the event.
pub time: SystemTime,
|
||||
}
|
||||
|
||||
impl Default for PageTraceEvent {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
key: Default::default(),
|
||||
effective_lsn: Default::default(),
|
||||
time: std::time::UNIX_EPOCH,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,8 +31,6 @@
|
||||
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
//! and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -50,23 +48,6 @@ pub struct ShardIdentity {
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Hash implementation
|
||||
///
|
||||
/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
|
||||
impl Hash for ShardIdentity {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
let ShardIdentity {
|
||||
number,
|
||||
count,
|
||||
stripe_size: _,
|
||||
layout: _,
|
||||
} = self;
|
||||
|
||||
number.0.hash(state);
|
||||
count.0.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Stripe size in number of pages
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
@@ -78,7 +59,7 @@ impl Default for ShardStripeSize {
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
|
||||
const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
|
||||
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlMultiXactCreate {
|
||||
pub mid: MultiXactId,
|
||||
/* new MultiXact's ID */
|
||||
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlMultiXactTruncate {
|
||||
pub oldest_multi_db: Oid,
|
||||
/* to-be-truncated range of multixact offsets */
|
||||
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlRelmapUpdate {
|
||||
pub dbid: Oid, /* database ID, or 0 for shared map */
|
||||
pub tsid: Oid, /* database's tablespace, or pg_global */
|
||||
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlReploriginDrop {
|
||||
pub node_id: RepOriginId,
|
||||
}
|
||||
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlReploriginSet {
|
||||
pub remote_lsn: Lsn,
|
||||
pub node_id: RepOriginId,
|
||||
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlSmgrTruncate {
|
||||
pub blkno: BlockNumber,
|
||||
pub rnode: RelFileNode,
|
||||
@@ -984,7 +984,7 @@ impl XlDropDatabase {
|
||||
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
|
||||
/// struct for commits and aborts.
|
||||
///
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct XlXactParsedRecord {
|
||||
pub xid: TransactionId,
|
||||
pub info: u8,
|
||||
|
||||
@@ -43,17 +43,6 @@ impl RemoteStorageKind {
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
/// Helper to fetch the configured concurrency limit.
|
||||
pub fn concurrency_limit(&self) -> Option<usize> {
|
||||
match &self.storage {
|
||||
RemoteStorageKind::LocalFs { .. } => None,
|
||||
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
|
||||
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_timeout() -> Duration {
|
||||
RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
@@ -5,12 +5,9 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
const_format.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
tokio.workspace = true
|
||||
utils.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -4,15 +4,12 @@ use const_format::formatcp;
|
||||
use pq_proto::SystemId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub mod membership;
|
||||
/// Public API types
|
||||
pub mod models;
|
||||
|
||||
/// Consensus logical timestamp. Note: it is a part of sk control file.
|
||||
pub type Term = u64;
|
||||
/// With this term timeline is created initially. It
|
||||
/// is a normal term except wp is never elected with it.
|
||||
pub const INITIAL_TERM: Term = 0;
|
||||
pub const INVALID_TERM: Term = 0;
|
||||
|
||||
/// Information about Postgres. Safekeeper gets it once and then verifies all
|
||||
/// further connections from computes match. Note: it is a part of sk control
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
//! Types defining safekeeper membership, see
|
||||
//! rfcs/035-safekeeper-dynamic-membership-change.md
|
||||
//! for details.
|
||||
|
||||
use std::{collections::HashSet, fmt::Display};
|
||||
|
||||
use anyhow;
|
||||
use anyhow::bail;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
/// Number uniquely identifying safekeeper configuration.
|
||||
/// Note: it is a part of sk control file.
|
||||
pub type Generation = u32;
|
||||
/// 1 is the first valid generation, 0 is used as
|
||||
/// a placeholder before we fully migrate to generations.
|
||||
pub const INVALID_GENERATION: Generation = 0;
|
||||
pub const INITIAL_GENERATION: Generation = 1;
|
||||
|
||||
/// Membership is defined by ids so e.g. walproposer uses them to figure out
|
||||
/// quorums, but we also carry host and port to give wp idea where to connect.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct SafekeeperId {
|
||||
pub id: NodeId,
|
||||
pub host: String,
|
||||
/// We include here only port for computes -- that is, pg protocol tenant
|
||||
/// only port, or wide pg protocol port if the former is not configured.
|
||||
pub pg_port: u16,
|
||||
}
|
||||
|
||||
impl Display for SafekeeperId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port)
|
||||
}
|
||||
}
|
||||
|
||||
/// Set of safekeepers.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(transparent)]
|
||||
pub struct MemberSet {
|
||||
pub members: Vec<SafekeeperId>,
|
||||
}
|
||||
|
||||
impl MemberSet {
|
||||
pub fn empty() -> Self {
|
||||
MemberSet {
|
||||
members: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
|
||||
let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
|
||||
if hs.len() != members.len() {
|
||||
bail!("duplicate safekeeper id in the set {:?}", members);
|
||||
}
|
||||
Ok(MemberSet { members })
|
||||
}
|
||||
|
||||
pub fn contains(&self, sk: &SafekeeperId) -> bool {
|
||||
self.members.iter().any(|m| m.id == sk.id)
|
||||
}
|
||||
|
||||
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
|
||||
if self.contains(&sk) {
|
||||
bail!(format!(
|
||||
"sk {} is already member of the set {}",
|
||||
sk.id, self
|
||||
));
|
||||
}
|
||||
self.members.push(sk);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemberSet {
|
||||
/// Display as a comma separated list of members.
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let sks_str = self
|
||||
.members
|
||||
.iter()
|
||||
.map(|m| m.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "({})", sks_str.join(", "))
|
||||
}
|
||||
}
|
||||
|
||||
/// Safekeeper membership configuration.
|
||||
/// Note: it is a part of both control file and http API.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct Configuration {
|
||||
/// Unique id.
|
||||
pub generation: Generation,
|
||||
/// Current members of the configuration.
|
||||
pub members: MemberSet,
|
||||
/// Some means it is a joint conf.
|
||||
pub new_members: Option<MemberSet>,
|
||||
}
|
||||
|
||||
impl Configuration {
|
||||
/// Used for pre-generations timelines, will be removed eventually.
|
||||
pub fn empty() -> Self {
|
||||
Configuration {
|
||||
generation: INVALID_GENERATION,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Configuration {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"gen={}, members={}, new_members={}",
|
||||
self.generation,
|
||||
self.members,
|
||||
self.new_members
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or(String::from("none"))
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::{MemberSet, SafekeeperId};
    use utils::id::NodeId;

    /// Adding members rejects duplicate ids, and the set serializes as a bare
    /// JSON array of its members.
    #[test]
    fn test_member_set() {
        // All test safekeepers share the same port; only id and host vary.
        let safekeeper = |id: NodeId, host: &str| SafekeeperId {
            id,
            host: String::from(host),
            pg_port: 5432,
        };

        let mut set = MemberSet::empty();
        set.add(safekeeper(NodeId(42), "lala.org")).unwrap();

        set.add(safekeeper(NodeId(42), "lala.org"))
            .expect_err("duplicate must not be allowed");

        set.add(safekeeper(NodeId(43), "bubu.org")).unwrap();

        println!("members: {}", set);

        let json = serde_json::to_string(&set).expect("failed to serialize");
        println!("members json: {}", json);
        assert_eq!(
            json,
            r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
        );
    }
}
|
||||
@@ -1,6 +1,5 @@
|
||||
//! Types used in safekeeper http API. Many of them are also reused internally.
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::SocketAddr;
|
||||
@@ -12,7 +11,7 @@ use utils::{
|
||||
pageserver_feedback::PageserverFeedback,
|
||||
};
|
||||
|
||||
use crate::{membership::Configuration, ServerInfo, Term};
|
||||
use crate::{ServerInfo, Term};
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct SafekeeperStatus {
|
||||
@@ -23,16 +22,13 @@ pub struct SafekeeperStatus {
|
||||
pub struct TimelineCreateRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mconf: Configuration,
|
||||
pub peer_ids: Option<Vec<NodeId>>,
|
||||
pub pg_version: u32,
|
||||
pub system_id: Option<u64>,
|
||||
// By default WAL_SEGMENT_SIZE
|
||||
pub wal_seg_size: Option<u32>,
|
||||
pub start_lsn: Lsn,
|
||||
// Normal creation should omit this field (start_lsn initializes all LSNs).
|
||||
// However, we allow specifying custom value higher than start_lsn for
|
||||
// manual recovery case, see test_s3_wal_replay.
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
pub commit_lsn: Lsn,
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
@@ -147,13 +143,7 @@ pub type ConnectionId = u32;
|
||||
|
||||
/// Serialize is used only for json'ing in API response. Also used internally.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum WalSenderState {
|
||||
Vanilla(VanillaWalSenderState),
|
||||
Interpreted(InterpretedWalSenderState),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VanillaWalSenderState {
|
||||
pub struct WalSenderState {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub addr: SocketAddr,
|
||||
pub conn_id: ConnectionId,
|
||||
@@ -162,17 +152,6 @@ pub struct VanillaWalSenderState {
|
||||
pub feedback: ReplicationFeedback,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct InterpretedWalSenderState {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub shard: ShardIdentity,
|
||||
pub addr: SocketAddr,
|
||||
pub conn_id: ConnectionId,
|
||||
// postgres application_name
|
||||
pub appname: Option<String>,
|
||||
pub feedback: ReplicationFeedback,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalReceiverState {
|
||||
/// None means it is recovery initiated by us (this safekeeper).
|
||||
@@ -193,7 +172,6 @@ pub enum WalReceiverStatus {
|
||||
pub struct TimelineStatus {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mconf: Configuration,
|
||||
pub acceptor_state: AcceptorStateStatus,
|
||||
pub pg_info: ServerInfo,
|
||||
pub flush_lsn: Lsn,
|
||||
@@ -208,20 +186,6 @@ pub struct TimelineStatus {
|
||||
pub walreceivers: Vec<WalReceiverState>,
|
||||
}
|
||||
|
||||
/// Request to switch membership configuration.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TimelineMembershipSwitchRequest {
|
||||
pub mconf: Configuration,
|
||||
}
|
||||
|
||||
/// In response both previous and current configuration are sent.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineMembershipSwitchResponse {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
}
|
||||
|
||||
/// Returns `Lsn::INVALID`.
// NOTE(review): presumably referenced as a `#[serde(default = "...")]` provider -- the call site is not visible here, confirm.
fn lsn_invalid() -> Lsn {
|
||||
Lsn::INVALID
|
||||
}
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
|
||||
//! don't block reads.
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::TryLockError;
|
||||
|
||||
/// An `ArcSwap<T>` paired with an async mutex that serializes writers.
///
/// Reads load the current value straight from the `ArcSwap` without touching
/// the mutex; writers must first obtain a `Guard`, which holds the mutex.
pub struct GuardArcSwap<T> {
|
||||
// Current value; loaded by readers, replaced by guard holders.
inner: ArcSwap<T>,
|
||||
// Taken for the lifetime of a `Guard` so only one writer exists at a time.
guard: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
/// Write handle for a `GuardArcSwap`; holds the writer mutex for as long as it lives.
pub struct Guard<'a, T> {
|
||||
// Held only for its drop behaviour: releasing it lets the next writer in.
_guard: tokio::sync::MutexGuard<'a, ()>,
|
||||
// The swappable value this guard is allowed to replace.
inner: &'a ArcSwap<T>,
|
||||
}
|
||||
|
||||
impl<T> GuardArcSwap<T> {
|
||||
pub fn new(inner: T) -> Self {
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(inner)),
|
||||
guard: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn read(&self) -> Arc<T> {
|
||||
self.inner.load_full()
|
||||
}
|
||||
|
||||
pub async fn write_guard(&self) -> Guard<'_, T> {
|
||||
Guard {
|
||||
_guard: self.guard.lock().await,
|
||||
inner: &self.inner,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
|
||||
let guard = self.guard.try_lock()?;
|
||||
Ok(Guard {
|
||||
_guard: guard,
|
||||
inner: &self.inner,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Guard<'_, T> {
|
||||
pub fn read(&self) -> Arc<T> {
|
||||
self.inner.load_full()
|
||||
}
|
||||
|
||||
pub fn write(&mut self, value: T) {
|
||||
self.inner.store(Arc::new(value));
|
||||
}
|
||||
}
|
||||
@@ -98,8 +98,6 @@ pub mod try_rcu;
|
||||
|
||||
pub mod pprof;
|
||||
|
||||
pub mod guard_arc_swap;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
@@ -24,18 +24,3 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
remote_storage.workspace = true
|
||||
tokio-util.workspace = true
|
||||
serde_json.workspace = true
|
||||
futures.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
pprof.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_interpret_wal"
|
||||
harness = false
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
## WAL Decoding and Interpretation Benchmarks
|
||||
|
||||
Note that these benchmarks pull WAL from a public bucket in S3
|
||||
as a preparation step. Hence, you need a way to auth with AWS.
|
||||
You can achieve this by copying the `~/.aws/config` file from
|
||||
the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking
|
||||
the benchmarks.
|
||||
|
||||
To run benchmarks:
|
||||
|
||||
```sh
|
||||
aws sso login --profile dev
|
||||
|
||||
# All benchmarks.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder
|
||||
|
||||
# Specific file.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal
|
||||
|
||||
# Specific benchmark.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded
|
||||
|
||||
# List available benchmarks.
|
||||
cargo bench --package wal_decoder --benches -- --list
|
||||
|
||||
# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
|
||||
# Output in target/criterion/*/profile/flamegraph.svg.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10
|
||||
```
|
||||
|
||||
Additional charts and statistics are available in `target/criterion/report/index.html`.
|
||||
|
||||
Benchmarks are automatically compared against the previous run. To compare against other runs, see
|
||||
`--baseline` and `--save-baseline`.
|
||||
@@ -1,250 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use serde::Deserialize;
|
||||
use std::{env, num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use remote_storage::{
|
||||
DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
use wal_decoder::models::InterpretedWalRecord;
|
||||
|
||||
const S3_BUCKET: &str = "neon-github-public-dev";
|
||||
const S3_REGION: &str = "eu-central-1";
|
||||
const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/";
|
||||
const METADATA_FILENAME: &str = "metadata.json";
|
||||
|
||||
/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
|
||||
/// This mirrors the configuration in bin/safekeeper.rs.
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
|
||||
async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: S3_BUCKET.to_string(),
|
||||
bucket_region: S3_REGION.to_string(),
|
||||
prefix_in_bucket: Some(BUCKET_PREFIX.to_string()),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: None,
|
||||
upload_storage_class: None,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
async fn download_bench_data(
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Utf8TempDir> {
|
||||
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?;
|
||||
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;
|
||||
|
||||
eprintln!("Downloading benchmark data to {:?}", temp_dir);
|
||||
|
||||
let listing = client
|
||||
.list(None, ListingMode::NoDelimiter, None, cancel)
|
||||
.await?;
|
||||
|
||||
let mut downloads = listing
|
||||
.keys
|
||||
.into_iter()
|
||||
.map(|obj| {
|
||||
let client = client.clone();
|
||||
let temp_dir_path = temp_dir.path().to_owned();
|
||||
|
||||
async move {
|
||||
let remote_path = obj.key;
|
||||
let download = client
|
||||
.download(&remote_path, &DownloadOpts::default(), cancel)
|
||||
.await?;
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let file_name = remote_path.object_name().unwrap();
|
||||
let file_path = temp_dir_path.join(file_name);
|
||||
let file = tokio::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(&file_path)
|
||||
.await?;
|
||||
|
||||
let mut writer = tokio::io::BufWriter::new(file);
|
||||
tokio::io::copy_buf(&mut body, &mut writer).await?;
|
||||
|
||||
Ok::<(), anyhow::Error>(())
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
while let Some(download) = downloads.next().await {
|
||||
download?;
|
||||
}
|
||||
|
||||
Ok(temp_dir)
|
||||
}
|
||||
|
||||
struct BenchmarkData {
|
||||
wal: Vec<u8>,
|
||||
meta: BenchmarkMetadata,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct BenchmarkMetadata {
|
||||
pg_version: u32,
|
||||
start_lsn: Lsn,
|
||||
}
|
||||
|
||||
async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
|
||||
eprintln!("Loading benchmark data from {:?}", path);
|
||||
|
||||
let mut entries = tokio::fs::read_dir(path).await?;
|
||||
let mut ordered_segment_paths = Vec::new();
|
||||
let mut metadata = None;
|
||||
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
if entry.file_name() == METADATA_FILENAME {
|
||||
let bytes = tokio::fs::read(entry.path()).await?;
|
||||
metadata = Some(
|
||||
serde_json::from_slice::<BenchmarkMetadata>(&bytes)
|
||||
.context("failed to deserialize metadata.json")?,
|
||||
);
|
||||
} else {
|
||||
ordered_segment_paths.push(entry.path());
|
||||
}
|
||||
}
|
||||
|
||||
ordered_segment_paths.sort();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for path in ordered_segment_paths {
|
||||
if buffer.len() >= input_size {
|
||||
break;
|
||||
}
|
||||
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
let file = tokio::fs::File::open(path).await?;
|
||||
let reader = tokio::io::BufReader::new(file);
|
||||
let decoder = ZstdDecoder::new(reader);
|
||||
let mut reader = tokio::io::BufReader::new(decoder);
|
||||
tokio::io::copy_buf(&mut reader, &mut buffer).await?;
|
||||
}
|
||||
|
||||
buffer.truncate(input_size);
|
||||
|
||||
Ok(BenchmarkData {
|
||||
wal: buffer,
|
||||
meta: metadata.unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
const INPUT_SIZE: usize = 128 * 1024 * 1024;
|
||||
|
||||
let setup_runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
|
||||
let cancel = CancellationToken::new();
|
||||
let client = create_s3_client().await.unwrap();
|
||||
let temp_dir = download_bench_data(client, &cancel).await.unwrap();
|
||||
let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();
|
||||
|
||||
(temp_dir, bench_data)
|
||||
});
|
||||
|
||||
eprintln!(
|
||||
"Benchmarking against {} MiB of WAL",
|
||||
INPUT_SIZE / 1024 / 1024
|
||||
);
|
||||
|
||||
let mut group = c.benchmark_group("decode-interpret-wal");
|
||||
group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
|
||||
group.sample_size(10);
|
||||
|
||||
group.bench_function("unsharded", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
|
||||
});
|
||||
|
||||
let eight_shards = (0..8)
|
||||
.map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
group.bench_function("8/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
|
||||
});
|
||||
|
||||
let four_shards = eight_shards
|
||||
.into_iter()
|
||||
.filter(|s| s.number.0 % 2 == 0)
|
||||
.collect::<Vec<_>>();
|
||||
group.bench_function("4/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &four_shards))
|
||||
});
|
||||
|
||||
let two_shards = four_shards
|
||||
.into_iter()
|
||||
.filter(|s| s.number.0 % 4 == 0)
|
||||
.collect::<Vec<_>>();
|
||||
group.bench_function("2/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &two_shards))
|
||||
});
|
||||
}
|
||||
|
||||
fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
|
||||
let r = decode_interpret(bench, shards);
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
|
||||
let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
|
||||
let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
|
||||
decoder.feed_bytes(chunk);
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
assert!(lsn.is_aligned());
|
||||
let _ = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
shard,
|
||||
lsn,
|
||||
bench.meta.pg_version,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
criterion_group!(
|
||||
name=benches;
|
||||
config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets=criterion_benchmark
|
||||
);
|
||||
criterion_main!(benches);
|
||||
@@ -1,8 +1,6 @@
|
||||
//! This module contains logic for decoding and interpreting
|
||||
//! raw bytes which represent a raw Postgres WAL record.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::models::*;
|
||||
use crate::serialized_batch::SerializedValueBatch;
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -16,15 +14,15 @@ use utils::lsn::Lsn;
|
||||
|
||||
impl InterpretedWalRecord {
|
||||
/// Decode and interpreted raw bytes which represent one Postgres WAL record.
|
||||
/// Data blocks which do not match any of the provided shard identities are filtered out.
|
||||
/// Data blocks which do not match the provided shard identity are filtered out.
|
||||
/// Shard 0 is a special case since it tracks all relation sizes. We only give it
|
||||
/// the keys that are being written as that is enough for updating relation sizes.
|
||||
pub fn from_bytes_filtered(
|
||||
buf: Bytes,
|
||||
shards: &[ShardIdentity],
|
||||
shard: &ShardIdentity,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
|
||||
) -> anyhow::Result<InterpretedWalRecord> {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(buf, &mut decoded, pg_version)?;
|
||||
let xid = decoded.xl_xid;
|
||||
@@ -35,57 +33,43 @@ impl InterpretedWalRecord {
|
||||
FlushUncommittedRecords::No
|
||||
};
|
||||
|
||||
let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
|
||||
HashMap::with_capacity(shards.len());
|
||||
for shard in shards {
|
||||
shard_records.insert(
|
||||
*shard,
|
||||
InterpretedWalRecord {
|
||||
metadata_record: None,
|
||||
batch: SerializedValueBatch::default(),
|
||||
next_record_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
MetadataRecord::from_decoded_filtered(
|
||||
&decoded,
|
||||
&mut shard_records,
|
||||
next_record_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
SerializedValueBatch::from_decoded_filtered(
|
||||
let metadata_record =
|
||||
MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
|
||||
let batch = SerializedValueBatch::from_decoded_filtered(
|
||||
decoded,
|
||||
&mut shard_records,
|
||||
shard,
|
||||
next_record_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
|
||||
Ok(shard_records)
|
||||
Ok(InterpretedWalRecord {
|
||||
metadata_record,
|
||||
batch,
|
||||
next_record_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataRecord {
|
||||
/// Populates the given `shard_records` with metadata records from this WAL record, if any,
|
||||
/// discarding those belonging to other shards.
|
||||
/// Builds a metadata record for this WAL record, if any.
|
||||
///
|
||||
/// Only metadata records relevant for the given shards is emitted. Currently, most metadata
|
||||
/// Only metadata records relevant for the given shard are emitted. Currently, most metadata
|
||||
/// records are broadcast to all shards for simplicity, but this should be improved.
|
||||
fn from_decoded_filtered(
|
||||
decoded: &DecodedWALRecord,
|
||||
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
|
||||
shard: &ShardIdentity,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Note: this doesn't actually copy the bytes since
|
||||
// the [`Bytes`] type implements it via a level of indirection.
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
// First, generate metadata records from the decoded WAL record.
|
||||
let metadata_record = match decoded.xl_rmid {
|
||||
let mut metadata_record = match decoded.xl_rmid {
|
||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
||||
Self::decode_heapam_record(&mut buf, decoded, pg_version)?
|
||||
}
|
||||
@@ -128,65 +112,41 @@ impl MetadataRecord {
|
||||
};
|
||||
|
||||
// Next, filter the metadata record by shard.
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
let updated_old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
let updated_new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() {
|
||||
// Clone the record and update it for the current shard.
|
||||
let mut for_shard = metadata_record.clone();
|
||||
match for_shard {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
|
||||
ref mut clear_vm_bits,
|
||||
))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
|
||||
ref mut clear_vm_bits,
|
||||
)),
|
||||
) => {
|
||||
clear_vm_bits.old_heap_blkno = updated_old_heap_blkno;
|
||||
clear_vm_bits.new_heap_blkno = updated_new_heap_blkno;
|
||||
record.metadata_record = for_shard;
|
||||
}
|
||||
_ => {
|
||||
unreachable!("for_shard is a clone of what we checked above")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if shard.is_shard_zero() {
|
||||
record.metadata_record = metadata_record;
|
||||
// No other shards should receive this record, so we stop traversing shards early.
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// All other metadata records are sent to all shards.
|
||||
record.metadata_record = metadata_record.clone();
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
clear_vm_bits.old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
clear_vm_bits.new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
|
||||
{
|
||||
metadata_record = None
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if !shard.is_shard_zero() {
|
||||
metadata_record = None;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(metadata_record)
|
||||
}
|
||||
|
||||
fn decode_heapam_record(
|
||||
|
||||
@@ -48,7 +48,7 @@ pub mod proto {
|
||||
tonic::include_proto!("interpreted_wal");
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum FlushUncommittedRecords {
|
||||
Yes,
|
||||
No,
|
||||
@@ -64,7 +64,7 @@ pub struct InterpretedWalRecords {
|
||||
}
|
||||
|
||||
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct InterpretedWalRecord {
|
||||
/// Optional metadata record - may cause writes to metadata keys
|
||||
/// in the storage engine
|
||||
@@ -107,7 +107,7 @@ impl InterpretedWalRecord {
|
||||
|
||||
/// The interpreted part of the Postgres WAL record which requires metadata
|
||||
/// writes to the underlying storage engine.
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum MetadataRecord {
|
||||
Heapam(HeapamRecord),
|
||||
Neonrmgr(NeonrmgrRecord),
|
||||
@@ -123,12 +123,12 @@ pub enum MetadataRecord {
|
||||
Replorigin(ReploriginRecord),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum HeapamRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ClearVmBits {
|
||||
pub new_heap_blkno: Option<u32>,
|
||||
pub old_heap_blkno: Option<u32>,
|
||||
@@ -136,29 +136,29 @@ pub struct ClearVmBits {
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum NeonrmgrRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum SmgrRecord {
|
||||
Create(SmgrCreate),
|
||||
Truncate(XlSmgrTruncate),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SmgrCreate {
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum DbaseRecord {
|
||||
Create(DbaseCreate),
|
||||
Drop(DbaseDrop),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct DbaseCreate {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_id: Oid,
|
||||
@@ -166,32 +166,32 @@ pub struct DbaseCreate {
|
||||
pub src_tablespace_id: Oid,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct DbaseDrop {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_ids: Vec<Oid>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum ClogRecord {
|
||||
ZeroPage(ClogZeroPage),
|
||||
Truncate(ClogTruncate),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ClogZeroPage {
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ClogTruncate {
|
||||
pub pageno: u32,
|
||||
pub oldest_xid: TransactionId,
|
||||
pub oldest_xid_db: Oid,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum XactRecord {
|
||||
Commit(XactCommon),
|
||||
Abort(XactCommon),
|
||||
@@ -200,7 +200,7 @@ pub enum XactRecord {
|
||||
Prepare(XactPrepare),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct XactCommon {
|
||||
pub parsed: XlXactParsedRecord,
|
||||
pub origin_id: u16,
|
||||
@@ -209,73 +209,73 @@ pub struct XactCommon {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct XactPrepare {
|
||||
pub xl_xid: TransactionId,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum MultiXactRecord {
|
||||
ZeroPage(MultiXactZeroPage),
|
||||
Create(XlMultiXactCreate),
|
||||
Truncate(XlMultiXactTruncate),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct MultiXactZeroPage {
|
||||
pub slru_kind: SlruKind,
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum RelmapRecord {
|
||||
Update(RelmapUpdate),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct RelmapUpdate {
|
||||
pub update: XlRelmapUpdate,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum XlogRecord {
|
||||
Raw(RawXlogRecord),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct RawXlogRecord {
|
||||
pub info: u8,
|
||||
pub lsn: Lsn,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum LogicalMessageRecord {
|
||||
Put(PutLogicalMessage),
|
||||
#[cfg(feature = "testing")]
|
||||
Failpoint,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct PutLogicalMessage {
|
||||
pub path: String,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum StandbyRecord {
|
||||
RunningXacts(StandbyRunningXacts),
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct StandbyRunningXacts {
|
||||
pub oldest_running_xid: TransactionId,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum ReploriginRecord {
|
||||
Set(XlReploriginSet),
|
||||
Drop(XlReploriginDrop),
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! Such batches are created from decoded PG wal records and ingested
|
||||
//! by the pageserver by writing directly to the ephemeral file.
|
||||
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
@@ -22,8 +22,6 @@ use utils::lsn::Lsn;
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use crate::models::InterpretedWalRecord;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
|
||||
/// Accompanying metadata for the batch
|
||||
@@ -32,7 +30,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
/// relation sizes. In the case of "observed" values, we only need to know
|
||||
/// the key and LSN, so two types of metadata are supported to save on network
|
||||
/// bandwidth.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum ValueMeta {
|
||||
Serialized(SerializedValueMeta),
|
||||
Observed(ObservedValueMeta),
|
||||
@@ -79,7 +77,7 @@ impl PartialEq for OrderedValueMeta {
|
||||
impl Eq for OrderedValueMeta {}
|
||||
|
||||
/// Metadata for a [`Value`] serialized into the batch.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SerializedValueMeta {
|
||||
pub key: CompactKey,
|
||||
pub lsn: Lsn,
|
||||
@@ -91,14 +89,14 @@ pub struct SerializedValueMeta {
|
||||
}
|
||||
|
||||
/// Metadata for a [`Value`] observed by the batch
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ObservedValueMeta {
|
||||
pub key: CompactKey,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Batch of serialized [`Value`]s.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SerializedValueBatch {
|
||||
/// [`Value`]s serialized in EphemeralFile's native format,
|
||||
/// ready for disk write by the pageserver
|
||||
@@ -130,8 +128,7 @@ impl Default for SerializedValueBatch {
|
||||
}
|
||||
|
||||
impl SerializedValueBatch {
|
||||
/// Populates the given `shard_records` with value batches from this WAL record, if any,
|
||||
/// discarding those belonging to other shards.
|
||||
/// Build a batch of serialized values from a decoded PG WAL record
|
||||
///
|
||||
/// The batch will only contain values for keys targeting the specifiec
|
||||
/// shard. Shard 0 is a special case, where any keys that don't belong to
|
||||
@@ -139,20 +136,21 @@ impl SerializedValueBatch {
|
||||
/// but absent from the raw buffer [`SerializedValueBatch::raw`]).
|
||||
pub(crate) fn from_decoded_filtered(
|
||||
decoded: DecodedWALRecord,
|
||||
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
|
||||
shard: &ShardIdentity,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
// First determine how big the buffers need to be and allocate it up-front.
|
||||
) -> anyhow::Result<SerializedValueBatch> {
|
||||
// First determine how big the buffer needs to be and allocate it up-front.
|
||||
// This duplicates some of the work below, but it's empirically much faster.
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
assert!(record.batch.is_empty());
|
||||
|
||||
let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version);
|
||||
record.batch.raw = Vec::with_capacity(estimate);
|
||||
}
|
||||
let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
|
||||
let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
|
||||
|
||||
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
let mut len: usize = 0;
|
||||
for blk in decoded.blocks.iter() {
|
||||
let relative_off = buf.len() as u64;
|
||||
|
||||
let rel = RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
@@ -170,98 +168,99 @@ impl SerializedValueBatch {
|
||||
);
|
||||
}
|
||||
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
|
||||
tracing::debug!(
|
||||
lsn=%next_record_lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
tracing::debug!(
|
||||
lsn=%next_record_lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
record
|
||||
.batch
|
||||
.metadata
|
||||
.push(ValueMeta::Observed(ObservedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
let val = if Self::block_is_image(&decoded, blk, pg_version) {
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, next_record_lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
let relative_off = record.batch.raw.len() as u64;
|
||||
|
||||
val.ser_into(&mut record.batch.raw)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
let val_ser_size = record.batch.raw.len() - relative_off as usize;
|
||||
|
||||
record
|
||||
.batch
|
||||
.metadata
|
||||
.push(ValueMeta::Serialized(SerializedValueMeta {
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
metadata.push(ValueMeta::Observed(ObservedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
}));
|
||||
record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn);
|
||||
record.batch.len += 1;
|
||||
}))
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
let val = if Self::block_is_image(&decoded, blk, pg_version) {
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, next_record_lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
val.ser_into(&mut buf)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
let val_ser_size = buf.len() - relative_off as usize;
|
||||
|
||||
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
}));
|
||||
max_lsn = std::cmp::max(max_lsn, next_record_lsn);
|
||||
len += 1;
|
||||
}
|
||||
|
||||
if cfg!(any(debug_assertions, test)) {
|
||||
// Validate that the batches are correct
|
||||
for record in shard_records.values() {
|
||||
record.batch.validate_lsn_order();
|
||||
}
|
||||
let batch = Self {
|
||||
raw: buf,
|
||||
metadata,
|
||||
max_lsn,
|
||||
len,
|
||||
};
|
||||
|
||||
batch.validate_lsn_order();
|
||||
|
||||
return Ok(batch);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(Self {
|
||||
raw: buf,
|
||||
metadata,
|
||||
max_lsn,
|
||||
len,
|
||||
})
|
||||
}
|
||||
|
||||
/// Look into the decoded PG WAL record and determine
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -16,7 +16,6 @@ arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
bit_field.workspace = true
|
||||
bincode.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
@@ -45,7 +44,6 @@ postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -110,11 +108,3 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
harness = false
|
||||
|
||||
[[bin]]
|
||||
name = "test_helper_slow_client_reads"
|
||||
required-features = [ "testing" ]
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
//! Upload queue benchmarks.
|
||||
|
||||
use std::str::FromStr as _;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
|
||||
use pageserver::tenant::metadata::TimelineMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use utils::generation::Generation;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = bench_upload_queue_next_ready,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
|
||||
/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
|
||||
/// queue as a whole is thus quadratic.
|
||||
///
|
||||
/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
|
||||
/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
|
||||
fn bench_upload_queue_next_ready(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("upload_queue_next_ready");
|
||||
for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
|
||||
g.bench_function(format!("inprogress={inprogress}"), |b| {
|
||||
run_bench(b, inprogress).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
|
||||
// Construct two layers. layer0 is in the indexes, layer1 will be deleted.
|
||||
let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
|
||||
let metadata = LayerFileMetadata {
|
||||
shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
|
||||
generation: Generation::Valid(1),
|
||||
file_size: 0,
|
||||
};
|
||||
|
||||
// Construct the (initial and uploaded) index with layer0.
|
||||
let mut index = IndexPart::empty(TimelineMetadata::example());
|
||||
index.layer_metadata.insert(layer0, metadata.clone());
|
||||
|
||||
// Construct the queue.
|
||||
let mut queue = UploadQueue::Uninitialized;
|
||||
let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
|
||||
|
||||
// Populate inprogress_tasks with a bunch of layer1 deletions.
|
||||
let delete = UploadOp::Delete(Delete {
|
||||
layers: vec![(layer1, metadata)],
|
||||
});
|
||||
|
||||
for task_id in 0..(inprogress as u64) {
|
||||
queue.inprogress_tasks.insert(
|
||||
task_id,
|
||||
Arc::new(UploadTask {
|
||||
task_id,
|
||||
retries: AtomicU32::new(0),
|
||||
op: delete.clone(),
|
||||
coalesced_ops: Vec::new(),
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Benchmark index upload scheduling.
|
||||
let index_upload = UploadOp::UploadMetadata {
|
||||
uploaded: Box::new(index),
|
||||
};
|
||||
|
||||
b.iter(|| {
|
||||
queue.queued_operations.push_front(index_upload.clone());
|
||||
assert!(queue.next_ready().is_some());
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,9 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = [ "pageserver_api/testing" ]
|
||||
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::pin::Pin;
|
||||
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
@@ -13,6 +10,7 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -64,28 +62,15 @@ impl Client {
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
|
||||
ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
},
|
||||
)));
|
||||
Ok(PagestreamClient {
|
||||
sink: PagestreamSender {
|
||||
shared: shared.clone(),
|
||||
sink,
|
||||
},
|
||||
stream: PagestreamReceiver {
|
||||
shared: shared.clone(),
|
||||
stream,
|
||||
},
|
||||
shared,
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -112,28 +97,7 @@ impl Client {
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: PagestreamSender,
|
||||
stream: PagestreamReceiver,
|
||||
}
|
||||
|
||||
pub struct PagestreamSender {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
|
||||
}
|
||||
|
||||
pub struct PagestreamReceiver {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
|
||||
}
|
||||
|
||||
enum PagestreamShared {
|
||||
ConnTaskRunning(ConnTaskRunning),
|
||||
ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
}
|
||||
struct ConnTaskRunning {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
@@ -146,11 +110,11 @@ pub struct RelTagBlockNo {
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(self) {
|
||||
let Self {
|
||||
shared,
|
||||
sink,
|
||||
stream,
|
||||
} = { self };
|
||||
// The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
copy_both,
|
||||
cancel_on_client_drop: cancel_conn_task,
|
||||
conn_task,
|
||||
} = self;
|
||||
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
|
||||
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
|
||||
//
|
||||
@@ -167,77 +131,27 @@ impl PagestreamClient {
|
||||
//
|
||||
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
|
||||
// => https://github.com/neondatabase/neon/issues/6390
|
||||
let ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
} = {
|
||||
let mut guard = shared.lock().unwrap();
|
||||
match std::mem::replace(
|
||||
&mut *guard,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
) {
|
||||
PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
|
||||
}
|
||||
};
|
||||
let _ = cancel_on_client_drop.unwrap();
|
||||
let _ = cancel_conn_task.unwrap();
|
||||
conn_task.await.unwrap();
|
||||
|
||||
// Now drop the split copy_both.
|
||||
drop(sink);
|
||||
drop(stream);
|
||||
}
|
||||
|
||||
pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
|
||||
let Self {
|
||||
shared: _,
|
||||
sink,
|
||||
stream,
|
||||
} = self;
|
||||
(sink, stream)
|
||||
drop(copy_both);
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.getpage_send(req).await?;
|
||||
self.getpage_recv().await
|
||||
}
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.sink.getpage_send(req).await
|
||||
}
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.stream.getpage_recv().await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamSender {
|
||||
// TODO: maybe make this impl Sink instead for better composability?
|
||||
pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
|
||||
let msg = msg.serialize();
|
||||
self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.send(PagestreamFeMessage::GetPage(req)).await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamReceiver {
|
||||
// TODO: maybe make this impl Stream instead for better composability?
|
||||
pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
PagestreamBeMessage::deserialize(next)
|
||||
}
|
||||
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let next: PagestreamBeMessage = self.recv().await?;
|
||||
match next {
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
@@ -246,14 +160,7 @@ impl PagestreamReceiver {
|
||||
| PagestreamBeMessage::GetSlruSegment(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamBeMessage::Test(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
msg.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,11 +8,9 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
camino.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
humantime.workspace = true
|
||||
itertools.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
|
||||
@@ -9,9 +9,7 @@ mod index_part;
|
||||
mod key;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
mod page_trace;
|
||||
|
||||
use page_trace::PageTraceCmd;
|
||||
use std::{
|
||||
str::FromStr,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -66,7 +64,6 @@ enum Commands {
|
||||
Layer(LayerCmd),
|
||||
/// Debug print a hex key found from logs
|
||||
Key(key::DescribeKeyCommand),
|
||||
PageTrace(PageTraceCmd),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::BufReader;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_api::key::{CompactKey, Key};
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
|
||||
/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats.
|
||||
#[derive(Parser)]
|
||||
pub(crate) struct PageTraceCmd {
|
||||
/// Trace input file.
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> {
|
||||
let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?);
|
||||
let mut events: Vec<PageTraceEvent> = Vec::new();
|
||||
loop {
|
||||
match bincode::deserialize_from(&mut file) {
|
||||
Ok(event) => events.push(event),
|
||||
Err(err) => {
|
||||
if let bincode::ErrorKind::Io(ref err) = *err {
|
||||
if err.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return Err(err.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut reads_by_relation: HashMap<RelTag, i64> = HashMap::new();
|
||||
let mut reads_by_key: HashMap<CompactKey, i64> = HashMap::new();
|
||||
|
||||
for event in events {
|
||||
let key = Key::from_compact(event.key);
|
||||
let reltag = RelTag {
|
||||
spcnode: key.field2,
|
||||
dbnode: key.field3,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
};
|
||||
|
||||
*reads_by_relation.entry(reltag).or_default() += 1;
|
||||
*reads_by_key.entry(event.key).or_default() += 1;
|
||||
}
|
||||
|
||||
let multi_read_keys = reads_by_key
|
||||
.into_iter()
|
||||
.filter(|(_, count)| *count > 1)
|
||||
.sorted_by_key(|(key, count)| (-*count, *key))
|
||||
.collect_vec();
|
||||
|
||||
println!("Multi-read keys: {}", multi_read_keys.len());
|
||||
for (key, count) in multi_read_keys {
|
||||
println!(" {key}: {count}");
|
||||
}
|
||||
|
||||
let reads_by_relation = reads_by_relation
|
||||
.into_iter()
|
||||
.sorted_by_key(|(rel, count)| (-*count, *rel))
|
||||
.collect_vec();
|
||||
|
||||
println!("Reads by relation:");
|
||||
for (reltag, count) in reads_by_relation {
|
||||
println!(" {reltag}: {count}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,65 +0,0 @@
|
||||
use std::{
|
||||
io::{stdin, stdout, Read, Write},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
connstr: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let Args {
|
||||
connstr,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = Args::parse();
|
||||
let client = pageserver_client::page_service::Client::new(connstr).await?;
|
||||
let client = client.pagestream(tenant_id, timeline_id).await?;
|
||||
let (mut sender, _receiver) = client.split();
|
||||
|
||||
eprintln!("filling the pipe");
|
||||
let mut msg = 0;
|
||||
loop {
|
||||
msg += 1;
|
||||
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
|
||||
PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(23),
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
},
|
||||
));
|
||||
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
|
||||
eprintln!("pipe seems full");
|
||||
break;
|
||||
};
|
||||
let _: () = res?;
|
||||
}
|
||||
|
||||
let n = stdout().write(b"R")?;
|
||||
assert_eq!(n, 1);
|
||||
stdout().flush()?;
|
||||
|
||||
eprintln!("waiting for signal to tell us to exit");
|
||||
|
||||
let mut buf = [0u8; 1];
|
||||
stdin().read_exact(&mut buf)?;
|
||||
|
||||
eprintln!("termination signal received, exiting");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -27,7 +27,6 @@ use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::OffloadedTimelineInfo;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantConfigPatchRequest;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
@@ -52,9 +51,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
use scopeguard::defer;
|
||||
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -1524,71 +1521,6 @@ async fn timeline_gc_unblocking_handler(
|
||||
block_or_unblock_gc(request, false).await
|
||||
}
|
||||
|
||||
/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding.
|
||||
/// Use the `pagectl page-trace` command to decode and analyze the output.
|
||||
async fn timeline_page_trace_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024);
|
||||
let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5);
|
||||
|
||||
// Convert size limit to event limit based on the serialized size of an event. The event size is
|
||||
// fixed, as the default bincode serializer uses fixed-width integer encoding.
|
||||
let event_size = bincode::serialize(&PageTraceEvent::default())
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
.len();
|
||||
let event_limit = size_limit / event_size;
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
// Install a page trace, unless one is already in progress. We just use a buffered channel,
|
||||
// which may 2x the memory usage in the worst case, but it's still bounded.
|
||||
let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit);
|
||||
let cur = timeline.page_trace.load();
|
||||
let installed = cur.is_none()
|
||||
&& timeline
|
||||
.page_trace
|
||||
.compare_and_swap(cur, Some(Arc::new(trace_tx)))
|
||||
.is_none();
|
||||
if !installed {
|
||||
return Err(ApiError::Conflict("page trace already active".to_string()));
|
||||
}
|
||||
defer!(timeline.page_trace.store(None)); // uninstall on return
|
||||
|
||||
// Collect the trace and return it to the client. We could stream the response, but this is
|
||||
// simple and fine.
|
||||
let mut body = Vec::with_capacity(size_limit);
|
||||
let deadline = Instant::now() + Duration::from_secs(time_limit_secs);
|
||||
|
||||
while body.len() < size_limit {
|
||||
tokio::select! {
|
||||
event = trace_rx.recv() => {
|
||||
let Some(event) = event else {
|
||||
break; // shouldn't happen (sender doesn't close, unless timeline dropped)
|
||||
};
|
||||
bincode::serialize_into(&mut body, &event)
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
}
|
||||
_ = tokio::time::sleep_until(deadline) => break, // time limit reached
|
||||
_ = cancel.cancelled() => return Err(ApiError::Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(body))
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
|
||||
///
|
||||
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
|
||||
@@ -3547,10 +3479,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| api_handler(r, timeline_gc_unblocking_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace",
|
||||
|r| api_handler(r, timeline_page_trace_handler),
|
||||
)
|
||||
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
|
||||
api_handler(r, secondary_upload_handler)
|
||||
})
|
||||
|
||||
@@ -278,8 +278,6 @@ async fn import_wal(
|
||||
|
||||
let mut walingest = WalIngest::new(tline, startpoint, ctx).await?;
|
||||
|
||||
let shard = vec![*tline.get_shard_identity()];
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
|
||||
@@ -316,12 +314,10 @@ async fn import_wal(
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&shard,
|
||||
tline.get_shard_identity(),
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?
|
||||
.remove(tline.get_shard_identity())
|
||||
.unwrap();
|
||||
)?;
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
@@ -415,7 +411,6 @@ pub async fn import_wal_from_tar(
|
||||
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?;
|
||||
let shard = vec![*tline.get_shard_identity()];
|
||||
|
||||
// Ingest wal until end_lsn
|
||||
info!("importing wal until {}", end_lsn);
|
||||
@@ -464,12 +459,10 @@ pub async fn import_wal_from_tar(
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&shard,
|
||||
tline.get_shard_identity(),
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?
|
||||
.remove(tline.get_shard_identity())
|
||||
.unwrap();
|
||||
)?;
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
|
||||
@@ -1224,189 +1224,117 @@ pub(crate) struct SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
|
||||
timings: SmgrOpTimerState,
|
||||
}
|
||||
|
||||
/// The stages of request processing are represented by the enum variants.
|
||||
/// Used as part of [`SmgrOpTimerInner::timings`].
|
||||
///
|
||||
/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
|
||||
/// transition points.
|
||||
/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
|
||||
/// to the next state.
|
||||
///
|
||||
/// Each request goes through every stage, in all configurations.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
enum SmgrOpTimerState {
|
||||
Received {
|
||||
// In the future, we may want to track the full time the request spent
|
||||
// inside pageserver process (time spent in kernel buffers can't be tracked).
|
||||
// `received_at` would be used for that.
|
||||
#[allow(dead_code)]
|
||||
received_at: Instant,
|
||||
},
|
||||
Throttling {
|
||||
ThrottleDoneExecutionStarting {
|
||||
received_at: Instant,
|
||||
throttle_started_at: Instant,
|
||||
started_execution_at: Instant,
|
||||
},
|
||||
Batching {
|
||||
throttle_done_at: Instant,
|
||||
},
|
||||
Executing {
|
||||
execution_started_at: Instant,
|
||||
},
|
||||
Flushing,
|
||||
// NB: when adding observation points, remember to update the Drop impl.
|
||||
}
|
||||
|
||||
// NB: when adding observation points, remember to update the Drop impl.
|
||||
impl SmgrOpTimer {
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
|
||||
return;
|
||||
};
|
||||
inner.throttling.count_accounted_start.inc();
|
||||
inner.timings = SmgrOpTimerState::Throttling {
|
||||
throttle_started_at: at,
|
||||
};
|
||||
}
|
||||
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Throttling {
|
||||
throttle_started_at,
|
||||
} = &inner.timings
|
||||
else {
|
||||
return;
|
||||
};
|
||||
inner.throttling.count_accounted_finish.inc();
|
||||
match throttle {
|
||||
ThrottleResult::NotThrottled { end } => {
|
||||
inner.timings = SmgrOpTimerState::Batching {
|
||||
throttle_done_at: end,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { end } => {
|
||||
// update metrics
|
||||
inner.throttling.count_throttled.inc();
|
||||
inner
|
||||
.throttling
|
||||
.wait_time
|
||||
.inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Batching {
|
||||
throttle_done_at: end,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_execution_start(&mut self, at: Instant) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
|
||||
return;
|
||||
};
|
||||
// update metrics
|
||||
let batch = at - *throttle_done_at;
|
||||
inner.global_batch_wait_time.observe(batch.as_secs_f64());
|
||||
inner
|
||||
.per_timeline_batch_wait_time
|
||||
.observe(batch.as_secs_f64());
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Executing {
|
||||
execution_started_at: at,
|
||||
}
|
||||
}
|
||||
|
||||
/// For all but the first caller, this is a no-op.
|
||||
/// The first callers receives Some, subsequent ones None.
|
||||
///
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_execution_end_flush_start(
|
||||
&mut self,
|
||||
at: Instant,
|
||||
) -> Option<SmgrOpFlushInProgress> {
|
||||
// NB: unlike the other observe_* methods, this one take()s.
|
||||
#[allow(clippy::question_mark)] // maintain similar code pattern.
|
||||
let Some(mut inner) = self.0.take() else {
|
||||
return None;
|
||||
};
|
||||
let SmgrOpTimerState::Executing {
|
||||
execution_started_at,
|
||||
} = &inner.timings
|
||||
else {
|
||||
return None;
|
||||
};
|
||||
// update metrics
|
||||
let execution = at - *execution_started_at;
|
||||
inner
|
||||
.global_execution_latency_histo
|
||||
.observe(execution.as_secs_f64());
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
|
||||
}
|
||||
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Flushing;
|
||||
|
||||
// return the flush in progress object which
|
||||
// will do the remaining metrics updates
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
Some(SmgrOpFlushInProgress {
|
||||
flush_started_at: at,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The last stage of request processing is serializing and flushing the request
|
||||
/// into the TCP connection. We want to make slow flushes observable
|
||||
/// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`]
|
||||
/// to periodically bump the metric.
|
||||
///
|
||||
/// If in the future we decide that we're not interested in live updates, we can
|
||||
/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
|
||||
/// and remove this struct from the code base.
|
||||
pub(crate) struct SmgrOpFlushInProgress {
|
||||
flush_started_at: Instant,
|
||||
global_micros: IntCounter,
|
||||
per_timeline_micros: IntCounter,
|
||||
}
|
||||
|
||||
impl SmgrOpTimer {
|
||||
pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
|
||||
let inner = self.0.as_mut().expect("other public methods consume self");
|
||||
match (&mut inner.timings, throttle) {
|
||||
(SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
|
||||
ThrottleResult::NotThrottled { start } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *received_at,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *start,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { start, end } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *start,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *end,
|
||||
};
|
||||
}
|
||||
},
|
||||
(x, _) => panic!("called in unexpected state: {x:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
|
||||
let (flush_start, inner) = self
|
||||
.smgr_op_end()
|
||||
.expect("this method consume self, and the only other caller is drop handler");
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
SmgrOpFlushInProgress {
|
||||
flush_started_at: flush_start,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `None`` if this method has already been called, `Some` otherwise.
|
||||
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
|
||||
let inner = self.0.take()?;
|
||||
|
||||
let now = Instant::now();
|
||||
|
||||
let batch;
|
||||
let execution;
|
||||
let throttle;
|
||||
match inner.timings {
|
||||
SmgrOpTimerState::Received { received_at } => {
|
||||
batch = (now - received_at).as_secs_f64();
|
||||
// TODO: use label for dropped requests.
|
||||
// This is quite rare in practice, only during tenant/pageservers shutdown.
|
||||
throttle = Duration::ZERO;
|
||||
execution = Duration::ZERO.as_secs_f64();
|
||||
}
|
||||
SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at,
|
||||
throttle_started_at,
|
||||
started_execution_at,
|
||||
} => {
|
||||
batch = (throttle_started_at - received_at).as_secs_f64();
|
||||
throttle = started_execution_at - throttle_started_at;
|
||||
execution = (now - started_execution_at).as_secs_f64();
|
||||
}
|
||||
}
|
||||
|
||||
// update time spent in batching
|
||||
inner.global_batch_wait_time.observe(batch);
|
||||
inner.per_timeline_batch_wait_time.observe(batch);
|
||||
|
||||
// time spent in throttle metric is updated by throttle impl
|
||||
let _ = throttle;
|
||||
|
||||
// update metrics for execution latency
|
||||
inner.global_execution_latency_histo.observe(execution);
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution);
|
||||
}
|
||||
|
||||
Some((now, inner))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for SmgrOpTimer {
|
||||
fn drop(&mut self) {
|
||||
// In case of early drop, update any of the remaining metrics with
|
||||
// observations so that (started,finished) counter pairs balance out
|
||||
// and all counters on the latency path have the the same number of
|
||||
// observations.
|
||||
// It's technically lying and it would be better if each metric had
|
||||
// a separate label or similar for cancelled requests.
|
||||
// But we don't have that right now and counter pairs balancing
|
||||
// out is useful when using the metrics in panels and whatnot.
|
||||
let now = Instant::now();
|
||||
self.observe_throttle_start(now);
|
||||
self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
|
||||
self.observe_execution_start(now);
|
||||
self.observe_execution_end_flush_start(now);
|
||||
self.smgr_op_end();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1417,12 +1345,12 @@ impl SmgrOpFlushInProgress {
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let now = Instant::now();
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let now = Instant::now();
|
||||
let elapsed = now - self.flush_started_at;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
@@ -1463,10 +1391,9 @@ pub enum SmgrQueryType {
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
GetSlruSegment,
|
||||
#[cfg(feature = "testing")]
|
||||
Test,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_started: [IntCounter; SmgrQueryType::COUNT],
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
@@ -1478,7 +1405,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -1684,11 +1610,7 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(||
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
|
||||
) -> Self {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
@@ -1749,7 +1671,6 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros,
|
||||
global_batch_wait_time,
|
||||
per_timeline_batch_wait_time,
|
||||
throttling: pagestream_throttle_metrics,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
|
||||
@@ -1765,24 +1686,88 @@ impl SmgrQueryTimePerTimeline {
|
||||
SmgrOpTimer(Some(SmgrOpTimerInner {
|
||||
global_execution_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_execution_latency_histo: per_timeline_latency_histo,
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
|
||||
per_timeline_flush_in_progress_micros: self
|
||||
.per_timeline_flush_in_progress_micros
|
||||
.clone(),
|
||||
global_batch_wait_time: self.global_batch_wait_time.clone(),
|
||||
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
|
||||
throttling: self.throttling.clone(),
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
}))
|
||||
}
|
||||
|
||||
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
self.global_batch_size.observe(batch_size as f64);
|
||||
self.per_timeline_batch_size.observe(batch_size as f64);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use std::time::Instant;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
let expect: [(super::SmgrQueryType, &'static str); 5] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
(GetSlruSegment, "get_slru_segment"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
|
||||
|
||||
for op in &ops {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(
|
||||
&TenantShardId::unsharded(tenant_id),
|
||||
&timeline_id,
|
||||
);
|
||||
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
|
||||
.sum();
|
||||
(
|
||||
global,
|
||||
metrics.per_timeline_getpage_latency.get_sample_count(),
|
||||
)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let timer = metrics.start_smgr_op(*op, Instant::now());
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
|
||||
// getpage ops are tracked per-timeline, others aren't
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
} else {
|
||||
assert_eq!(post_per_tenant_timeline, 0);
|
||||
}
|
||||
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
@@ -3578,7 +3563,9 @@ pub(crate) mod tenant_throttling {
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
pub(crate) struct GlobalAndPerTenantIntCounter {
|
||||
use crate::tenant::{self};
|
||||
|
||||
struct GlobalAndPerTenantIntCounter {
|
||||
global: IntCounter,
|
||||
per_tenant: IntCounter,
|
||||
}
|
||||
@@ -3596,10 +3583,10 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
|
||||
pub(crate) struct Metrics<const KIND: usize> {
|
||||
pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
pub(super) wait_time: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_throttled: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
wait_time: GlobalAndPerTenantIntCounter,
|
||||
count_throttled: GlobalAndPerTenantIntCounter,
|
||||
}
|
||||
|
||||
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
@@ -3734,6 +3721,26 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
|
||||
#[inline(always)]
|
||||
fn accounting_start(&self) {
|
||||
self.count_accounted_start.inc();
|
||||
}
|
||||
#[inline(always)]
|
||||
fn accounting_finish(&self) {
|
||||
self.count_accounted_finish.inc();
|
||||
}
|
||||
#[inline(always)]
|
||||
fn observe_throttling(
|
||||
&self,
|
||||
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
|
||||
) {
|
||||
let val = u64::try_from(wait_time.as_micros()).unwrap();
|
||||
self.wait_time.inc_by(val);
|
||||
self.count_throttled.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) mod disk_usage_based_eviction {
|
||||
|
||||
@@ -67,7 +67,6 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::{basebackup, timed_after_cancellation};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -555,52 +554,37 @@ struct BatchedGetPageRequest {
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct BatchedTestRequest {
|
||||
req: models::PagestreamTestRequest,
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
|
||||
/// so that we don't keep the [`Timeline::gate`] open while the batch
|
||||
/// is being built up inside the [`spsc_fold`] (pagestream pipelining).
|
||||
enum BatchedFeMessage {
|
||||
Exists {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamExistsRequest,
|
||||
},
|
||||
Nblocks {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamNblocksRequest,
|
||||
},
|
||||
GetPage {
|
||||
span: Span,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
},
|
||||
DbSize {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamDbSizeRequest,
|
||||
},
|
||||
GetSlruSegment {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamGetSlruSegmentRequest,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
span: Span,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
},
|
||||
RespondError {
|
||||
span: Span,
|
||||
error: BatchedPageStreamError,
|
||||
@@ -608,27 +592,43 @@ enum BatchedFeMessage {
|
||||
}
|
||||
|
||||
impl BatchedFeMessage {
|
||||
fn observe_execution_start(&mut self, at: Instant) {
|
||||
match self {
|
||||
BatchedFeMessage::Exists { timer, .. }
|
||||
| BatchedFeMessage::Nblocks { timer, .. }
|
||||
| BatchedFeMessage::DbSize { timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { timer, .. } => {
|
||||
timer.observe_execution_start(at);
|
||||
async fn throttle_and_record_start_processing(
|
||||
&mut self,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError> {
|
||||
let (shard, tokens, timers) = match self {
|
||||
BatchedFeMessage::Exists { shard, timer, .. }
|
||||
| BatchedFeMessage::Nblocks { shard, timer, .. }
|
||||
| BatchedFeMessage::DbSize { shard, timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
|
||||
(
|
||||
shard,
|
||||
// 1 token is probably under-estimating because these
|
||||
// request handlers typically do several Timeline::get calls.
|
||||
1,
|
||||
itertools::Either::Left(std::iter::once(timer)),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetPage { pages, .. } => {
|
||||
for page in pages {
|
||||
page.timer.observe_execution_start(at);
|
||||
}
|
||||
BatchedFeMessage::GetPage { shard, pages, .. } => (
|
||||
shard,
|
||||
pages.len(),
|
||||
itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)),
|
||||
),
|
||||
BatchedFeMessage::RespondError { .. } => return Ok(()),
|
||||
};
|
||||
let throttled = tokio::select! {
|
||||
throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
|
||||
_ = shard.cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test { requests, .. } => {
|
||||
for req in requests {
|
||||
req.timer.observe_execution_start(at);
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
};
|
||||
for timer in timers {
|
||||
timer.observe_throttle_done_execution_starting(&throttled);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -720,26 +720,6 @@ impl PageServerHandler {
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
|
||||
// TODO: turn in to async closure once available to avoid repeating received_at
|
||||
async fn record_op_start_and_throttle(
|
||||
shard: &timeline::handle::Handle<TenantManagerTypes>,
|
||||
op: metrics::SmgrQueryType,
|
||||
received_at: Instant,
|
||||
) -> Result<SmgrOpTimer, QueryError> {
|
||||
// It's important to start the smgr op metric recorder as early as possible
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for throttle, batching, or actual request handling.
|
||||
let mut timer = shard.query_metrics.start_smgr_op(op, received_at);
|
||||
let now = Instant::now();
|
||||
timer.observe_throttle_start(now);
|
||||
let throttled = tokio::select! {
|
||||
res = shard.pagestream_throttle.throttle(1, now) => res,
|
||||
_ = shard.cancel.cancelled() => return Err(QueryError::Shutdown),
|
||||
};
|
||||
timer.observe_throttle_done(throttled);
|
||||
Ok(timer)
|
||||
}
|
||||
|
||||
let batched_msg = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
@@ -747,16 +727,13 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelExists,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
timer,
|
||||
shard: shard.downgrade(),
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -766,16 +743,13 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelSize,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
|
||||
BatchedFeMessage::Nblocks {
|
||||
span,
|
||||
timer,
|
||||
shard: shard.downgrade(),
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -785,16 +759,13 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetDbSize,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
|
||||
BatchedFeMessage::DbSize {
|
||||
span,
|
||||
timer,
|
||||
shard: shard.downgrade(),
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -804,16 +775,13 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetSlruSegment,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
span,
|
||||
timer,
|
||||
shard: shard.downgrade(),
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -858,14 +826,13 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
// It's important to start the timer before waiting for the LSN
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for LSNs.
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);
|
||||
|
||||
// We're holding the Handle
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
req.hdr.request_lsn,
|
||||
@@ -883,27 +850,11 @@ impl PageServerHandler {
|
||||
};
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessage::Test(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer =
|
||||
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
|
||||
.await?;
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
requests: vec![BatchedTestRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Some(batched_msg))
|
||||
}
|
||||
@@ -945,7 +896,9 @@ impl PageServerHandler {
|
||||
assert_eq!(accum_pages.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
|
||||
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
|
||||
{
|
||||
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
@@ -964,44 +917,6 @@ impl PageServerHandler {
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
Ok(BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
}),
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) if (|| {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
@@ -1022,13 +937,6 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
let batch = {
|
||||
let mut batch = batch;
|
||||
batch.observe_execution_start(started_at);
|
||||
batch
|
||||
};
|
||||
|
||||
// invoke handler function
|
||||
let (handler_results, span): (
|
||||
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
|
||||
@@ -1043,7 +951,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
|
||||
.handle_get_rel_exists_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1060,7 +968,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
|
||||
.handle_get_nblocks_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1081,7 +989,7 @@ impl PageServerHandler {
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&*shard.upgrade()?,
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
ctx,
|
||||
@@ -1103,7 +1011,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
(
|
||||
vec![self
|
||||
.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
|
||||
.handle_db_size_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1120,7 +1028,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
|
||||
.handle_get_slru_segment_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1128,27 +1036,6 @@ impl PageServerHandler {
|
||||
span,
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard,
|
||||
requests,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::test");
|
||||
(
|
||||
{
|
||||
let npages = requests.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::RespondError { span, error } => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
@@ -1216,11 +1103,8 @@ impl PageServerHandler {
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer = timer.map(|mut timer| {
|
||||
timer
|
||||
.observe_execution_end_flush_start(Instant::now())
|
||||
.expect("we are the first caller")
|
||||
});
|
||||
let flushing_timer =
|
||||
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
|
||||
|
||||
// what we want to do
|
||||
let flush_fut = pgb_writer.flush();
|
||||
@@ -1374,7 +1258,7 @@ impl PageServerHandler {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => break e,
|
||||
};
|
||||
let msg = match msg {
|
||||
let mut msg = match msg {
|
||||
Some(msg) => msg,
|
||||
None => {
|
||||
debug!("pagestream subprotocol end observed");
|
||||
@@ -1382,6 +1266,10 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
|
||||
break cancelled;
|
||||
}
|
||||
|
||||
let err = self
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
|
||||
.await;
|
||||
@@ -1541,12 +1429,15 @@ impl PageServerHandler {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let batch = match batch {
|
||||
let mut batch = match batch {
|
||||
Ok(batch) => batch,
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
batch
|
||||
.throttle_and_record_start_processing(&self.cancel)
|
||||
.await?;
|
||||
self.pagesteam_handle_batched_message(
|
||||
pgb_writer,
|
||||
batch,
|
||||
@@ -1814,20 +1705,6 @@ impl PageServerHandler {
|
||||
.query_metrics
|
||||
.observe_getpage_batch_start(requests.len());
|
||||
|
||||
// If a page trace is running, submit an event for this request.
|
||||
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
|
||||
let time = SystemTime::now();
|
||||
for batch in &requests {
|
||||
let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact();
|
||||
// Ignore error (trace buffer may be full or tracer may have disconnected).
|
||||
_ = page_trace.try_send(PageTraceEvent {
|
||||
key,
|
||||
effective_lsn,
|
||||
time,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
|
||||
@@ -1886,51 +1763,6 @@ impl PageServerHandler {
|
||||
))
|
||||
}
|
||||
|
||||
// NB: this impl mimics what we do for batched getpage requests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_test_request_batch(
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
// real requests would do something with the timeline
|
||||
let mut results = Vec::with_capacity(requests.len());
|
||||
for _req in requests.iter() {
|
||||
tokio::task::yield_now().await;
|
||||
|
||||
results.push({
|
||||
if timeline.cancel.is_cancelled() {
|
||||
Err(PageReconstructError::Cancelled)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: avoid creating the new Vec here
|
||||
Vec::from_iter(
|
||||
requests
|
||||
.into_iter()
|
||||
.zip(results.into_iter())
|
||||
.map(|(req, res)| {
|
||||
res.map(|()| {
|
||||
(
|
||||
PagestreamBeMessage::Test(models::PagestreamTestResponse {
|
||||
req: req.req.clone(),
|
||||
}),
|
||||
req.timer,
|
||||
)
|
||||
})
|
||||
.map_err(|e| BatchedPageStreamError {
|
||||
err: PageStreamError::from(e),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Note on "fullbackup":
|
||||
/// Full basebackups should only be used for debugging purposes.
|
||||
/// Originally, it was introduced to enable breaking storage format changes,
|
||||
@@ -2546,14 +2378,6 @@ impl From<GetActiveTimelineError> for QueryError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::timeline::handle::HandleUpgradeError> for QueryError {
|
||||
fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self {
|
||||
match e {
|
||||
crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn set_tracing_field_shard_id(timeline: &Timeline) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
tracing::Span::current().record(
|
||||
|
||||
@@ -365,9 +365,8 @@ pub struct Tenant {
|
||||
|
||||
/// Throttle applied at the top of [`Timeline::get`].
|
||||
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
|
||||
pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
|
||||
|
||||
pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
|
||||
/// An ongoing timeline detach concurrency limiter.
|
||||
///
|
||||
@@ -1688,7 +1687,6 @@ impl Tenant {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
LoadTimelineCause::Attach,
|
||||
@@ -3994,9 +3992,6 @@ impl Tenant {
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
|
||||
/// to ensure proper cleanup of background tasks and metrics.
|
||||
//
|
||||
// Allow too_many_arguments because a constructor's argument list naturally grows with the
|
||||
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -4105,10 +4100,8 @@ impl Tenant {
|
||||
gate: Gate::default(),
|
||||
pagestream_throttle: Arc::new(throttle::Throttle::new(
|
||||
Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
|
||||
)),
|
||||
pagestream_throttle_metrics: Arc::new(
|
||||
crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
|
||||
),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
@@ -5015,7 +5008,6 @@ impl Tenant {
|
||||
TimelineResources {
|
||||
remote_client: self.build_timeline_remote_client(timeline_id),
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,6 +320,7 @@ impl TimelineMetadata {
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
pub fn example() -> Self {
|
||||
let instance = Self::new(
|
||||
"0/16960E8".parse::<Lsn>().unwrap(),
|
||||
|
||||
@@ -63,18 +63,22 @@
|
||||
//! The contract between client and its user is that the user is responsible for
|
||||
//! scheduling operations in an order that keeps the remote consistent as
|
||||
//! described above.
|
||||
//!
|
||||
//! From the user's perspective, the operations are executed sequentially.
|
||||
//! Internally, the client knows which operations can be performed in parallel,
|
||||
//! and which operations act like a "barrier" that require preceding operations
|
||||
//! to finish. The calling code just needs to call the schedule-functions in the
|
||||
//! correct order, and the client will parallelize the operations in a way that
|
||||
//! is safe. For more details, see `UploadOp::can_bypass`.
|
||||
//! is safe.
|
||||
//!
|
||||
//! The caller should be careful with deletion, though. They should not delete
|
||||
//! local files that have been scheduled for upload but not yet finished uploading.
|
||||
//! Otherwise the upload will fail. To wait for an upload to finish, use
|
||||
//! the 'wait_completion' function (more on that later.)
|
||||
//!
|
||||
//! All of this relies on the following invariants:
|
||||
//!
|
||||
//! - We rely on read-after write consistency in the remote storage.
|
||||
//! - Layer files are immutable.
|
||||
//! - Layer files are immutable
|
||||
//!
|
||||
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
|
||||
//! storage. Different tenants can be attached to different pageservers, but if the
|
||||
@@ -382,12 +386,6 @@ pub(crate) struct RemoteTimelineClient {
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClient {
|
||||
fn drop(&mut self) {
|
||||
debug!("dropping RemoteTimelineClient");
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineClient {
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
@@ -431,16 +429,8 @@ impl RemoteTimelineClient {
|
||||
/// an index file upload, i.e., it's not empty.
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
info!(
|
||||
"initialized upload queue from remote index with {} layer files",
|
||||
@@ -455,16 +445,8 @@ impl RemoteTimelineClient {
|
||||
&self,
|
||||
local_metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
@@ -480,15 +462,9 @@ impl RemoteTimelineClient {
|
||||
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
|
||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||
))?;
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.stop_impl(&mut upload_queue);
|
||||
|
||||
@@ -1879,17 +1855,57 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Pick next tasks from the queue, and start as many of them as possible without violating
|
||||
/// the ordering constraints.
|
||||
///
|
||||
/// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
|
||||
/// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
|
||||
/// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
|
||||
/// The caller needs to already hold the `upload_queue` lock.
|
||||
fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
|
||||
debug!("starting op: {next_op}");
|
||||
while let Some(next_op) = upload_queue.queued_operations.front() {
|
||||
// Can we run this task now?
|
||||
let can_run_now = match next_op {
|
||||
UploadOp::UploadLayer(..) => {
|
||||
// Can always be scheduled.
|
||||
true
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
// These can only be performed after all the preceding operations
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(..) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
|
||||
// Prepare upload.
|
||||
UploadOp::Barrier(_) | UploadOp::Shutdown => {
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
};
|
||||
|
||||
// If we cannot launch this task, don't look any further.
|
||||
//
|
||||
// In some cases, we could let some non-frontmost tasks "jump the queue" and launch
|
||||
// them now, but we don't try to do that currently. For example, if the frontmost task
|
||||
// is an index-file upload that cannot proceed until preceding uploads have finished, we
|
||||
// could still start layer uploads that were scheduled later.
|
||||
if !can_run_now {
|
||||
break;
|
||||
}
|
||||
|
||||
if let UploadOp::Shutdown = next_op {
|
||||
// leave the op in the queue but do not start more tasks; it will be dropped when
|
||||
// the stop is called.
|
||||
upload_queue.shutdown_ready.close();
|
||||
break;
|
||||
}
|
||||
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters and prepare
|
||||
match &mut next_op {
|
||||
UploadOp::UploadLayer(layer, meta, mode) => {
|
||||
if upload_queue
|
||||
@@ -1900,14 +1916,18 @@ impl RemoteTimelineClient {
|
||||
} else {
|
||||
*mode = Some(OpType::MayReorder)
|
||||
}
|
||||
upload_queue.num_inprogress_layer_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {}
|
||||
UploadOp::Delete(Delete { layers }) => {
|
||||
for (name, meta) in layers {
|
||||
upload_queue
|
||||
.recently_deleted
|
||||
.insert((name.clone(), meta.generation));
|
||||
}
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
sender.send_replace(());
|
||||
@@ -1924,7 +1944,6 @@ impl RemoteTimelineClient {
|
||||
let task = Arc::new(UploadTask {
|
||||
task_id: upload_task_id,
|
||||
op: next_op,
|
||||
coalesced_ops,
|
||||
retries: AtomicU32::new(0),
|
||||
});
|
||||
upload_queue
|
||||
@@ -2008,8 +2027,6 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
|
||||
// TODO: check if this mechanism can be removed now that can_bypass() performs
|
||||
// conflict checks during scheduling.
|
||||
if let Some(OpType::FlushDeletion) = mode {
|
||||
if self.config.read().unwrap().block_deletions {
|
||||
// Of course, this is not efficient... but usually the queue should be empty.
|
||||
@@ -2232,8 +2249,13 @@ impl RemoteTimelineClient {
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _, _) => None,
|
||||
UploadOp::UploadLayer(_, _, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
|
||||
// the task id is reused as a monotonicity check for storing the "clean"
|
||||
// IndexPart.
|
||||
let last_updater = upload_queue.clean.1;
|
||||
@@ -2267,7 +2289,10 @@ impl RemoteTimelineClient {
|
||||
None
|
||||
}
|
||||
}
|
||||
UploadOp::Delete(_) => None,
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
|
||||
};
|
||||
|
||||
@@ -2292,9 +2317,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
self.metric_end(&task.op);
|
||||
for coalesced_op in &task.coalesced_ops {
|
||||
self.metric_end(coalesced_op);
|
||||
}
|
||||
}
|
||||
|
||||
fn metric_impl(
|
||||
@@ -2387,7 +2409,6 @@ impl RemoteTimelineClient {
|
||||
// but for this use case it doesn't really make sense to bring in unsafe code only for this usage point.
|
||||
// Deletion is not really perf sensitive so there shouldn't be any problems with cloning a fraction of it.
|
||||
let upload_queue_for_deletion = UploadQueueInitialized {
|
||||
inprogress_limit: initialized.inprogress_limit,
|
||||
task_counter: 0,
|
||||
dirty: initialized.dirty.clone(),
|
||||
clean: initialized.clean.clone(),
|
||||
@@ -2395,6 +2416,9 @@ impl RemoteTimelineClient {
|
||||
visible_remote_consistent_lsn: initialized
|
||||
.visible_remote_consistent_lsn
|
||||
.clone(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -2421,6 +2445,14 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
};
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads
|
||||
+ qi.num_inprogress_metadata_uploads
|
||||
+ qi.num_inprogress_deletions,
|
||||
qi.inprogress_tasks.len()
|
||||
);
|
||||
|
||||
// We don't need to do anything here for in-progress tasks. They will finish
|
||||
// on their own, decrement the unfinished-task counter themselves, and observe
|
||||
// that the queue is Stopped.
|
||||
@@ -2867,8 +2899,8 @@ mod tests {
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert!(upload_queue.queued_operations.is_empty());
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 2);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
|
||||
assert!(upload_queue.inprogress_tasks.len() == 2);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 2);
|
||||
|
||||
// also check that `latest_file_changes` was updated
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
|
||||
@@ -2938,8 +2970,8 @@ mod tests {
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert_eq!(upload_queue.queued_operations.len(), 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions(), 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions, 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
|
||||
@@ -104,7 +104,7 @@ impl IndexPart {
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn empty(metadata: TimelineMetadata) -> Self {
|
||||
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
|
||||
IndexPart {
|
||||
version: Self::LATEST_VERSION,
|
||||
layer_metadata: Default::default(),
|
||||
|
||||
@@ -1812,7 +1812,7 @@ enum LayerKind {
|
||||
|
||||
/// Guard for forcing a layer be resident while it exists.
|
||||
#[derive(Clone)]
|
||||
pub struct ResidentLayer {
|
||||
pub(crate) struct ResidentLayer {
|
||||
owner: Layer,
|
||||
downloaded: Arc<DownloadedLayer>,
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::Instant,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
@@ -16,8 +16,9 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
|
||||
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
|
||||
///
|
||||
/// The initial use case for this is tenant-wide throttling of getpage@lsn requests.
|
||||
pub struct Throttle {
|
||||
pub struct Throttle<M: Metric> {
|
||||
inner: ArcSwap<Inner>,
|
||||
metric: M,
|
||||
/// will be turned into [`Stats::count_accounted_start`]
|
||||
count_accounted_start: AtomicU64,
|
||||
/// will be turned into [`Stats::count_accounted_finish`]
|
||||
@@ -35,6 +36,15 @@ pub struct Inner {
|
||||
|
||||
pub type Config = pageserver_api::models::ThrottleConfig;
|
||||
|
||||
pub struct Observation {
|
||||
pub wait_time: Duration,
|
||||
}
|
||||
pub trait Metric {
|
||||
fn accounting_start(&self);
|
||||
fn accounting_finish(&self);
|
||||
fn observe_throttling(&self, observation: &Observation);
|
||||
}
|
||||
|
||||
/// See [`Throttle::reset_stats`].
|
||||
pub struct Stats {
|
||||
/// Number of requests that started [`Throttle::throttle`] calls.
|
||||
@@ -49,14 +59,18 @@ pub struct Stats {
|
||||
}
|
||||
|
||||
pub enum ThrottleResult {
|
||||
NotThrottled { end: Instant },
|
||||
Throttled { end: Instant },
|
||||
NotThrottled { start: Instant },
|
||||
Throttled { start: Instant, end: Instant },
|
||||
}
|
||||
|
||||
impl Throttle {
|
||||
pub fn new(config: Config) -> Self {
|
||||
impl<M> Throttle<M>
|
||||
where
|
||||
M: Metric,
|
||||
{
|
||||
pub fn new(config: Config, metric: M) -> Self {
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
|
||||
metric,
|
||||
count_accounted_start: AtomicU64::new(0),
|
||||
count_accounted_finish: AtomicU64::new(0),
|
||||
count_throttled: AtomicU64::new(0),
|
||||
@@ -113,27 +127,32 @@ impl Throttle {
|
||||
self.inner.load().rate_limiter.steady_rps()
|
||||
}
|
||||
|
||||
/// `start` must be [`Instant::now`] or earlier.
|
||||
pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult {
|
||||
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
if !inner.enabled {
|
||||
return ThrottleResult::NotThrottled { end: start };
|
||||
return ThrottleResult::NotThrottled { start };
|
||||
}
|
||||
|
||||
self.metric.accounting_start();
|
||||
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
|
||||
let did_throttle = inner.rate_limiter.acquire(key_count).await;
|
||||
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
|
||||
self.metric.accounting_finish();
|
||||
|
||||
if did_throttle {
|
||||
self.count_throttled.fetch_add(1, Ordering::Relaxed);
|
||||
let end = Instant::now();
|
||||
let wait_time = end - start;
|
||||
let now = Instant::now();
|
||||
let wait_time = now - start;
|
||||
self.sum_throttled_usecs
|
||||
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
|
||||
ThrottleResult::Throttled { end }
|
||||
let observation = Observation { wait_time };
|
||||
self.metric.observe_throttling(&observation);
|
||||
ThrottleResult::Throttled { start, end: now }
|
||||
} else {
|
||||
ThrottleResult::NotThrottled { end: start }
|
||||
ThrottleResult::NotThrottled { start }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use arc_swap::{ArcSwap, ArcSwapOption};
|
||||
use arc_swap::ArcSwap;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use chrono::{DateTime, Utc};
|
||||
@@ -23,7 +23,6 @@ use fail::fail_point;
|
||||
use handle::ShardTimelineId;
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
@@ -43,7 +42,6 @@ use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::{
|
||||
runtime::Handle,
|
||||
sync::{oneshot, watch},
|
||||
@@ -51,9 +49,7 @@ use tokio::{
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
fs_ext,
|
||||
guard_arc_swap::GuardArcSwap,
|
||||
pausable_failpoint,
|
||||
fs_ext, pausable_failpoint,
|
||||
postgres_client::PostgresClientProtocol,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
@@ -76,7 +72,6 @@ use std::{pin::pin, sync::OnceLock};
|
||||
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
page_service::TenantManagerTypes,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -213,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -356,8 +351,8 @@ pub struct Timeline {
|
||||
// though let's keep them both for better error visibility.
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// The repartitioning result. Allows a single writer and multiple readers.
|
||||
pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
/// When did we last calculate the partitioning? Make it pub to test cases.
|
||||
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -417,7 +412,8 @@ pub struct Timeline {
|
||||
gc_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
|
||||
pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
@@ -432,15 +428,12 @@ pub struct Timeline {
|
||||
|
||||
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) handles: handle::PerTimelineState<TenantManagerTypes>,
|
||||
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||
|
||||
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||
|
||||
/// Cf. [`crate::tenant::CreateTimelineIdempotency`].
|
||||
pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
|
||||
/// If Some, collects GetPage metadata for an ongoing PageTrace.
|
||||
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
|
||||
}
|
||||
|
||||
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
|
||||
@@ -2317,7 +2310,6 @@ impl Timeline {
|
||||
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
resources.pagestream_throttle_metrics,
|
||||
),
|
||||
|
||||
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
|
||||
@@ -2343,8 +2335,7 @@ impl Timeline {
|
||||
// initial logical size is 0.
|
||||
LogicalSize::empty_initial()
|
||||
},
|
||||
|
||||
partitioning: GuardArcSwap::new((
|
||||
partitioning: tokio::sync::Mutex::new((
|
||||
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
|
||||
Lsn(0),
|
||||
)),
|
||||
@@ -2389,8 +2380,6 @@ impl Timeline {
|
||||
attach_wal_lag_cooldown,
|
||||
|
||||
create_idempotency,
|
||||
|
||||
page_trace: Default::default(),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -3792,34 +3781,35 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// Ensure that we have a single call to `create_image_layers` with a combined dense keyspace.
|
||||
// So that the key ranges don't overlap.
|
||||
let mut partitions = KeyPartitioning::default();
|
||||
partitions.parts.extend(rel_partition.parts);
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
// Safety: create_image_layers treats sparse keyspaces differently in that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
partitions
|
||||
.parts
|
||||
.extend(metadata_partition.into_dense().parts);
|
||||
}
|
||||
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
&partitions,
|
||||
&rel_partition,
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
// Safety: create_image_layers treats sparse keyspaces differently in that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
&metadata_partition.into_dense(),
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
|
||||
(layers_to_upload, None)
|
||||
} else {
|
||||
@@ -4032,15 +4022,18 @@ impl Timeline {
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
let Ok(mut guard) = self.partitioning.try_write_guard() else {
|
||||
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
// Note that there is a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for
|
||||
// gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own
|
||||
// heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called concurrently"
|
||||
"repartition() called concurrently, this is rare and a retry should be fine"
|
||||
)));
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
|
||||
if lsn < *partition_lsn {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
@@ -4068,9 +4061,9 @@ impl Timeline {
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
}; // no partitioning for metadata keys for now
|
||||
let result = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
guard.write(result.clone());
|
||||
Ok(result)
|
||||
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
|
||||
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
@@ -4626,10 +4619,6 @@ impl Drop for Timeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Timeline {} for tenant {} is being dropped",
|
||||
self.timeline_id, self.tenant_shard_id.tenant_id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1776,10 +1776,7 @@ impl Timeline {
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
|
||||
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
|
||||
|
||||
if debug_mode {
|
||||
if cfg!(debug_assertions) {
|
||||
for (log_key, _, _) in full_history {
|
||||
assert_eq!(log_key, &key, "mismatched key");
|
||||
}
|
||||
@@ -1925,19 +1922,15 @@ impl Timeline {
|
||||
output
|
||||
}
|
||||
|
||||
let mut key_exists = false;
|
||||
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
|
||||
// TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
|
||||
records_since_last_image += split_for_lsn.len();
|
||||
// Whether to produce an image into the final layer files
|
||||
let produce_image = if i == 0 && !has_ancestor {
|
||||
let generate_image = if i == 0 && !has_ancestor {
|
||||
// We always generate images for the first batch (below horizon / lowest retain_lsn)
|
||||
true
|
||||
} else if i == batch_cnt - 1 {
|
||||
// Do not generate images for the last batch (above horizon)
|
||||
false
|
||||
} else if records_since_last_image == 0 {
|
||||
false
|
||||
} else if records_since_last_image >= delta_threshold_cnt {
|
||||
// Generate images when there are too many records
|
||||
true
|
||||
@@ -1952,45 +1945,29 @@ impl Timeline {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if replay_history.is_empty() && !key_exists {
|
||||
// The key does not exist at earlier LSN, we can skip this iteration.
|
||||
retention.push(Vec::new());
|
||||
continue;
|
||||
} else {
|
||||
key_exists = true;
|
||||
if let Some((_, _, val)) = replay_history.first() {
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
|
||||
|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
let Some((_, _, val)) = replay_history.first() else {
|
||||
unreachable!("replay history should not be empty once it exists")
|
||||
};
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
});
|
||||
}
|
||||
// Whether to reconstruct the image. In debug mode, we will generate an image
|
||||
// at every retain_lsn to ensure data is not corrupted, but we won't put the
|
||||
// image into the final layer.
|
||||
let generate_image = produce_image || debug_mode;
|
||||
if produce_image {
|
||||
if generate_image && records_since_last_image > 0 {
|
||||
records_since_last_image = 0;
|
||||
}
|
||||
let img_and_lsn = if generate_image {
|
||||
let replay_history_for_debug = if debug_mode {
|
||||
let replay_history_for_debug = if cfg!(debug_assertions) {
|
||||
Some(replay_history.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
|
||||
let history = if produce_image {
|
||||
std::mem::take(&mut replay_history)
|
||||
} else {
|
||||
replay_history.clone()
|
||||
};
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
|
||||
@@ -2027,20 +2004,8 @@ impl Timeline {
|
||||
}
|
||||
records.reverse();
|
||||
let state = ValueReconstructState { img, records };
|
||||
// last batch does not generate image so i is always in range, unless we force generate
|
||||
// an image during testing
|
||||
let request_lsn = if i >= lsn_split_points.len() {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
lsn_split_points[i]
|
||||
};
|
||||
let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
|
||||
let img = self.reconstruct_value(key, request_lsn, state).await?;
|
||||
Some((request_lsn, img))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
if produce_image {
|
||||
let (request_lsn, img) = img_and_lsn.unwrap();
|
||||
replay_history.push((key, request_lsn, Value::Image(img.clone())));
|
||||
retention.push(vec![(request_lsn, Value::Image(img))]);
|
||||
} else {
|
||||
@@ -2146,7 +2111,12 @@ impl Timeline {
|
||||
let mut compact_jobs = Vec::new();
|
||||
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
|
||||
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
|
||||
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
|
||||
let ((dense_ks, sparse_ks), _) = {
|
||||
let Ok(partition) = self.partitioning.try_lock() else {
|
||||
bail!("failed to acquire partition lock during gc-compaction");
|
||||
};
|
||||
partition.clone()
|
||||
};
|
||||
// Truncate the key range to be within user specified compaction range.
|
||||
fn truncate_to(
|
||||
source_start: &Key,
|
||||
@@ -2303,8 +2273,6 @@ impl Timeline {
|
||||
let compact_key_range = job.compact_key_range;
|
||||
let compact_lsn_range = job.compact_lsn_range;
|
||||
|
||||
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);
|
||||
|
||||
scopeguard::defer! {
|
||||
@@ -2430,7 +2398,7 @@ impl Timeline {
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(job_desc.gc_cutoff);
|
||||
if debug_mode {
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
res,
|
||||
job_desc
|
||||
|
||||
@@ -301,7 +301,6 @@ impl DeleteTimelineFlow {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: tenant.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
|
||||
},
|
||||
// Important. We don't pass ancestor above because it can be missing.
|
||||
|
||||
@@ -32,151 +32,54 @@
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! ## Data Structures
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
|
||||
//! trying to upgrade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
|
||||
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||
//!
|
||||
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
|
||||
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
|
||||
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||
//!
|
||||
//! The `HandleInner` is allocated as an `Arc<Mutex<HandleInner>>` and
|
||||
//! referenced weakly and strongly from various places which we are now illustrating.
|
||||
//! For brevity, we will omit the `Arc<Mutex<>>` part in the following and instead
|
||||
//! use `strong ref` and `weak ref` when referring to the `Arc<Mutex<HandleInner>>`
|
||||
//! or `Weak<Mutex<HandleInner>>`, respectively.
|
||||
//!
|
||||
//! - The `Handle` is a strong ref.
|
||||
//! - The `WeakHandle` is a weak ref.
|
||||
//! - The `PerTimelineState` contains a `HashMap<CacheId, strong ref>`.
|
||||
//! - The `Cache` is a `HashMap<unique identifier for the shard, weak ref>`.
|
||||
//!
|
||||
//! Lifetimes:
|
||||
//! - `WeakHandle` and `Handle`: single pagestream request.
|
||||
//! - `Cache`: single page service connection.
|
||||
//! - `PerTimelineState`: lifetime of the Timeline object (i.e., till `Timeline::shutdown`).
|
||||
//!
|
||||
//! ## Request Handling Flow (= filling and using the `Cache`)
|
||||
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the
|
||||
//! `Arc<Mutex<HandleInner>>`. A weak ref is stored in the `Cache`
|
||||
//! and a strong ref in the `PerTimelineState`.
|
||||
//! A strong ref is returned wrapped in a `Handle`.
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the weak ref in the cache.
|
||||
//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`.
|
||||
//! and find the `Weak<HandleInner>` in the cache.
|
||||
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||
//!
|
||||
//! The pagestream processing is pipelined and involves a batching step.
|
||||
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
|
||||
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
|
||||
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//!
|
||||
//! # Performance
|
||||
//! # Memory Management / How The Reference Cycle Is Broken
|
||||
//!
|
||||
//! Remember from the introductory section:
|
||||
//! The attentive reader may have noticed the strong reference cycle
|
||||
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||
//!
|
||||
//! > However, we want to avoid the overhead of entering the gate for every
|
||||
//! > method invocation.
|
||||
//!
|
||||
//! Why do we want to avoid that?
|
||||
//! Because the gate is a shared location in memory and entering it involves
|
||||
//! bumping refcounts, which leads to cache contention if done frequently
|
||||
//! from multiple cores in parallel.
|
||||
//!
|
||||
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
|
||||
//! That `Arc` is private to the `HandleInner` and hence to the connection.
|
||||
//! (Review the "Data Structures" section if that is unclear to you.)
|
||||
//!
|
||||
//! A `WeakHandle` is a weak ref to the `HandleInner`.
|
||||
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
|
||||
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
|
||||
//! Again, this manipulation of ref counts is cheap because the `Arc` is private to the connection.
|
||||
//!
|
||||
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
|
||||
//! Again, this is cheap because the `Arc` is private to the connection.
|
||||
//!
|
||||
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
|
||||
//! For this, the `Handle` needs infallible access to an `Arc<Timeline>`.
|
||||
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
|
||||
//! on the shared memory location that tracks the refcount of the `Arc<Timeline>`.
|
||||
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`,
|
||||
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
|
||||
//!
|
||||
//! # Shutdown
|
||||
//!
|
||||
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! Further, there is this cycle:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! The former cycle is a memory leak if not broken.
|
||||
//! The latter cycle further prevents the Timeline from shutting down
|
||||
//! because we certainly won't drop the Timeline while the GateGuard is alive.
|
||||
//! Preventing shutdown is the whole point of this handle/cache system,
|
||||
//! but when the Timeline needs to shut down, we need to break the cycle.
|
||||
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||
//!
|
||||
//! The cycle is broken by either
|
||||
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
|
||||
//! - Connection shutdown (=> dropping the `Cache`).
|
||||
//! - `PerTimelineState::shutdown` or
|
||||
//! - dropping the `Cache`.
|
||||
//!
|
||||
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
|
||||
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
|
||||
//! `Arc<GateGuard>`.
|
||||
//!
|
||||
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
|
||||
//! thereby breaking the cycle.
|
||||
//! It also initiates draining of already existing `Handle`s by
|
||||
//! poisoning things so that no new `HandleInner`'s can be added
|
||||
//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail.
|
||||
//!
|
||||
//! Concurrently existing / already upgraded `Handle`s will extend the
|
||||
//! lifetime of the `Arc<Mutex<HandleInner>>` and hence cycles.
|
||||
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||
//! handed out from `Cache::get` or `WeakHandle::upgrade` after
|
||||
//! `PerTimelineState::shutdown`, that extension of the cycle is bounded.
|
||||
//!
|
||||
//! Concurrently existing `WeakHandle`s will fail to `upgrade()`:
|
||||
//! while they will succeed in upgrading `Weak<Mutex<HandleInner>>`,
|
||||
//! they will find the inner in the `HandleInner::ShutDown` state where the
|
||||
//! `Arc<GateGuard>` and Timeline have already been dropped.
|
||||
//!
|
||||
//! Dropping the `Cache` undoes the registration of this `Cache`'s
|
||||
//! `HandleInner`s from all the `PerTimelineState`s, i.e., it
|
||||
//! removes the strong ref to each of its `HandleInner`s
|
||||
//! from all the `PerTimelineState`.
|
||||
//!
|
||||
//! # Locking Rules
|
||||
//!
|
||||
//! To prevent deadlocks we:
|
||||
//!
|
||||
//! 1. Only ever hold one of the locks at a time.
|
||||
//! 2. Don't add more than one Drop impl that locks on the
|
||||
//! cycles above.
|
||||
//!
|
||||
//! As per (2), that impl is in `Drop for Cache`.
|
||||
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||
//! that extension of the cycle is bounded.
|
||||
//!
|
||||
//! # Fast Path for Shard Routing
|
||||
//!
|
||||
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||
//! the tenant manager for every request.
|
||||
//!
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s.
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||
//!
|
||||
//! The current implementation uses the first entry in the hash map
|
||||
//! to determine the `ShardParameters` and derive the correct
|
||||
@@ -184,18 +87,18 @@
|
||||
//!
|
||||
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||
//!
|
||||
//! If the lookup is successful and the `WeakHandle` can be upgraded,
|
||||
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||
//! it's a hit.
|
||||
//!
|
||||
//! ## Cache invalidation
|
||||
//!
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently if done lazily.
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||
//! The only reasons why an entry in the cache can become stale are:
|
||||
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||
//!
|
||||
//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the
|
||||
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||
//!
|
||||
//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
|
||||
@@ -204,6 +107,8 @@
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
@@ -247,7 +152,7 @@ pub(crate) struct Cache<T: Types> {
|
||||
map: Map<T>,
|
||||
}
|
||||
|
||||
type Map<T> = HashMap<ShardTimelineId, WeakHandle<T>>;
|
||||
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||
|
||||
impl<T: Types> Default for Cache<T> {
|
||||
fn default() -> Self {
|
||||
@@ -265,22 +170,12 @@ pub(crate) struct ShardTimelineId {
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types> {
|
||||
timeline: Arc<T::Timeline>,
|
||||
#[allow(dead_code)] // the field exists to keep the gate open
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
inner: Arc<Mutex<HandleInner<T>>>,
|
||||
}
|
||||
pub(crate) struct WeakHandle<T: Types> {
|
||||
inner: Weak<Mutex<HandleInner<T>>>,
|
||||
}
|
||||
enum HandleInner<T: Types> {
|
||||
KeepingTimelineGateOpen {
|
||||
#[allow(dead_code)]
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
timeline: Arc<T::Timeline>,
|
||||
},
|
||||
ShutDown,
|
||||
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||
struct HandleInner<T: Types> {
|
||||
shut_down: AtomicBool,
|
||||
timeline: T::Timeline,
|
||||
// The timeline's gate held open.
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||
@@ -288,8 +183,7 @@ enum HandleInner<T: Types> {
|
||||
/// See module-level comment for details.
|
||||
pub struct PerTimelineState<T: Types> {
|
||||
// None = shutting down
|
||||
#[allow(clippy::type_complexity)]
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<Mutex<HandleInner<T>>>>>>,
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||
}
|
||||
|
||||
impl<T: Types> Default for PerTimelineState<T> {
|
||||
@@ -349,24 +243,49 @@ impl<T: Types> Cache<T> {
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
// terminates because when every iteration we remove an element from the map
|
||||
let miss: ShardSelector = loop {
|
||||
// terminates because each iteration removes an element from the map
|
||||
loop {
|
||||
let handle = self
|
||||
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||
.await?;
|
||||
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||
let removed = self
|
||||
.map
|
||||
.remove(&handle.0.timeline.shard_timeline_id())
|
||||
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||
assert!(
|
||||
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||
"shard_timeline_id() incorrect?"
|
||||
);
|
||||
} else {
|
||||
return Ok(handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
async fn get_impl(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
let miss: ShardSelector = {
|
||||
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||
match routing_state {
|
||||
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||
Some(cached) => match cached.upgrade() {
|
||||
Ok(upgraded) => return Ok(upgraded),
|
||||
Err(HandleUpgradeError::ShutDown) => {
|
||||
// TODO: dedup with shard_routing()
|
||||
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||
None => {
|
||||
trace!("handle cache stale");
|
||||
self.map.remove(&key).unwrap();
|
||||
continue;
|
||||
ShardSelector::Known(key.shard_index)
|
||||
}
|
||||
},
|
||||
None => break ShardSelector::Known(key.shard_index),
|
||||
None => ShardSelector::Known(key.shard_index),
|
||||
},
|
||||
RoutingResult::NeedConsultTenantManager => break shard_selector,
|
||||
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||
}
|
||||
};
|
||||
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||
@@ -383,7 +302,7 @@ impl<T: Types> Cache<T> {
|
||||
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||
return RoutingResult::NeedConsultTenantManager;
|
||||
};
|
||||
let Ok(first_handle) = first_handle.upgrade() else {
|
||||
let Some(first_handle) = first_handle.upgrade() else {
|
||||
// TODO: dedup with get()
|
||||
trace!("handle cache stale");
|
||||
let first_key_owned = *first_key;
|
||||
@@ -391,7 +310,7 @@ impl<T: Types> Cache<T> {
|
||||
continue;
|
||||
};
|
||||
|
||||
let first_handle_shard_identity = first_handle.get_shard_identity();
|
||||
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||
shard_number: shard_num,
|
||||
shard_count: first_handle_shard_identity.count,
|
||||
@@ -410,11 +329,11 @@ impl<T: Types> Cache<T> {
|
||||
};
|
||||
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: first_handle_shard_identity.shard_index(),
|
||||
timeline_id: first_handle.shard_timeline_id().timeline_id,
|
||||
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||
};
|
||||
|
||||
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||
return RoutingResult::FastPath(first_handle);
|
||||
return RoutingResult::FastPath(Handle(first_handle));
|
||||
} else {
|
||||
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||
}
|
||||
@@ -438,30 +357,23 @@ impl<T: Types> Cache<T> {
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
trace!("creating new HandleInner");
|
||||
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
|
||||
gate_guard: Arc::new(
|
||||
// this enter() is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::gate refcounts
|
||||
match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
},
|
||||
),
|
||||
// this clone is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::clone refcounts
|
||||
timeline: Arc::new(timeline.clone()),
|
||||
}));
|
||||
let handle_weak = WeakHandle {
|
||||
inner: Arc::downgrade(&handle_inner_arc),
|
||||
let gate_guard = match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
};
|
||||
let handle = handle_weak
|
||||
.upgrade()
|
||||
.ok()
|
||||
.expect("we just created it and it's not linked anywhere yet");
|
||||
{
|
||||
trace!("creating new HandleInner");
|
||||
let handle = Arc::new(
|
||||
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||
// so we can identify reference cycle bugs.
|
||||
HandleInner {
|
||||
shut_down: AtomicBool::new(false),
|
||||
_gate_guard: gate_guard,
|
||||
timeline: timeline.clone(),
|
||||
},
|
||||
);
|
||||
let handle = {
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
@@ -469,8 +381,7 @@ impl<T: Types> Cache<T> {
|
||||
.expect("mutex poisoned");
|
||||
match &mut *lock_guard {
|
||||
Some(per_timeline_state) => {
|
||||
let replaced =
|
||||
per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
|
||||
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
@@ -481,7 +392,8 @@ impl<T: Types> Cache<T> {
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(handle_weak);
|
||||
v.insert(Arc::downgrade(&handle));
|
||||
handle
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -489,62 +401,14 @@ impl<T: Types> Cache<T> {
|
||||
return Err(GetError::PerTimelineStateShutDown);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(handle)
|
||||
};
|
||||
Ok(Handle(handle))
|
||||
}
|
||||
Err(e) => Err(GetError::TenantManager(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum HandleUpgradeError {
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
impl<T: Types> WeakHandle<T> {
|
||||
pub(crate) fn upgrade(&self) -> Result<Handle<T>, HandleUpgradeError> {
|
||||
let Some(inner) = Weak::upgrade(&self.inner) else {
|
||||
return Err(HandleUpgradeError::ShutDown);
|
||||
};
|
||||
let lock_guard = inner.lock().expect("poisoned");
|
||||
match &*lock_guard {
|
||||
HandleInner::KeepingTimelineGateOpen {
|
||||
timeline,
|
||||
gate_guard,
|
||||
} => {
|
||||
let gate_guard = Arc::clone(gate_guard);
|
||||
let timeline = Arc::clone(timeline);
|
||||
drop(lock_guard);
|
||||
Ok(Handle {
|
||||
timeline,
|
||||
gate_guard,
|
||||
inner,
|
||||
})
|
||||
}
|
||||
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_same_handle_as(&self, other: &WeakHandle<T>) -> bool {
|
||||
Weak::ptr_eq(&self.inner, &other.inner)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> Handle<T> {
|
||||
pub(crate) fn downgrade(&self) -> WeakHandle<T> {
|
||||
WeakHandle {
|
||||
inner: Arc::downgrade(&self.inner),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> PerTimelineState<T> {
|
||||
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
@@ -566,62 +430,43 @@ impl<T: Types> PerTimelineState<T> {
|
||||
trace!("already shut down");
|
||||
return;
|
||||
};
|
||||
for handle_inner_arc in handles.values() {
|
||||
for handle in handles.values() {
|
||||
// Make hits fail.
|
||||
let mut lock_guard = handle_inner_arc.lock().expect("poisoned");
|
||||
lock_guard.shutdown();
|
||||
handle.shut_down.store(true, Ordering::Relaxed);
|
||||
}
|
||||
drop(handles);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0.timeline
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<T: Types> Drop for HandleInner<T> {
|
||||
fn drop(&mut self) {
|
||||
trace!("HandleInner dropped");
|
||||
}
|
||||
}
|
||||
|
||||
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
|
||||
impl<T: Types> Drop for Cache<T> {
|
||||
fn drop(&mut self) {
|
||||
for (
|
||||
_,
|
||||
WeakHandle {
|
||||
inner: handle_inner_weak,
|
||||
},
|
||||
) in self.map.drain()
|
||||
{
|
||||
let Some(handle_inner_arc) = handle_inner_weak.upgrade() else {
|
||||
continue;
|
||||
};
|
||||
let Some(handle_timeline) = handle_inner_arc
|
||||
// locking rules: drop lock before acquiring other lock below
|
||||
.lock()
|
||||
.expect("poisoned")
|
||||
.shutdown()
|
||||
else {
|
||||
// Concurrent PerTimelineState::shutdown.
|
||||
continue;
|
||||
};
|
||||
// Clean up per_timeline_state so the HandleInner allocation can be dropped.
|
||||
let per_timeline_state = handle_timeline.per_timeline_state();
|
||||
let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned");
|
||||
let Some(handles) = &mut *handles_lock_guard else {
|
||||
continue;
|
||||
};
|
||||
let Some(removed_handle_inner_arc) = handles.remove(&self.id) else {
|
||||
// Concurrent PerTimelineState::shutdown.
|
||||
continue;
|
||||
};
|
||||
drop(handles_lock_guard); // locking rules!
|
||||
assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> HandleInner<T> {
|
||||
fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
|
||||
match std::mem::replace(self, HandleInner::ShutDown) {
|
||||
HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
|
||||
HandleInner::ShutDown => {
|
||||
// Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
|
||||
// may do it concurrently, but locking rules disallow holding per-timeline-state lock and
|
||||
// the handle lock at the same time.
|
||||
None
|
||||
for (_, weak) in self.map.drain() {
|
||||
if let Some(strong) = weak.upgrade() {
|
||||
// handle is still being kept alive in PerTimelineState
|
||||
let timeline = strong.timeline.per_timeline_state();
|
||||
let mut handles = timeline.handles.lock().expect("mutex poisoned");
|
||||
if let Some(handles) = &mut *handles {
|
||||
let Some(removed) = handles.remove(&self.id) else {
|
||||
// There could have been a shutdown in between us upgrading the weak and locking the mutex.
|
||||
continue;
|
||||
};
|
||||
assert!(Arc::ptr_eq(&removed, &strong));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -629,8 +474,6 @@ impl<T: Types> HandleInner<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Weak;
|
||||
|
||||
use pageserver_api::{
|
||||
key::{rel_block_to_key, Key, DBDIR_KEY},
|
||||
models::ShardParameters,
|
||||
@@ -740,13 +583,39 @@ mod tests {
|
||||
//
|
||||
// fill the cache
|
||||
//
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
let handle: Handle<_> = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
let handle_inner_weak = Arc::downgrade(&handle.0);
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
assert_eq!(
|
||||
(
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
Weak::weak_count(&handle_inner_weak)
|
||||
),
|
||||
(2, 2),
|
||||
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
|
||||
);
|
||||
assert_eq!(cache.map.len(), 1);
|
||||
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
//
|
||||
// demonstrate that Handle holds up gate closure
|
||||
@@ -771,11 +640,21 @@ mod tests {
|
||||
// SHUTDOWN
|
||||
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"through local var handle"
|
||||
);
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
1,
|
||||
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(via handle), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// this handle is perfectly usable
|
||||
handle.getpage();
|
||||
@@ -799,6 +678,16 @@ mod tests {
|
||||
}
|
||||
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
0,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"the HandleInner destructor already ran"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// closing gate succeeds after dropping handle
|
||||
tokio::select! {
|
||||
@@ -817,8 +706,10 @@ mod tests {
|
||||
assert_eq!(cache.map.len(), 0);
|
||||
|
||||
// ensure all refs to shard0 are gone and we're not leaking anything
|
||||
let myself = Weak::clone(&shard0.myself);
|
||||
drop(shard0);
|
||||
drop(mgr);
|
||||
assert_eq!(Weak::strong_count(&myself), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -1057,11 +948,15 @@ mod tests {
|
||||
handle
|
||||
};
|
||||
handle.getpage();
|
||||
used_handles.push(Arc::downgrade(&handle.timeline));
|
||||
used_handles.push(Arc::downgrade(&handle.0));
|
||||
}
|
||||
|
||||
// No handles exist, thus gates are closed and don't require shutdown.
|
||||
// Thus the gate should close immediately, even without shutdown.
|
||||
// No handles exist, thus gates are closed and don't require shutdown
|
||||
assert!(used_handles
|
||||
.iter()
|
||||
.all(|weak| Weak::strong_count(weak) == 0));
|
||||
|
||||
// ... thus the gate should close immediately, even without shutdown
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
@@ -1069,75 +964,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_weak_handles() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mgr = StubManager {
|
||||
shards: vec![shard0.clone()],
|
||||
};
|
||||
|
||||
let refcount_start = Arc::strong_count(&shard0);
|
||||
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
let handle = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
|
||||
let weak_handle = handle.downgrade();
|
||||
|
||||
drop(handle);
|
||||
|
||||
let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it");
|
||||
|
||||
// Start shutdown
|
||||
shard0.per_timeline_state.shutdown();
|
||||
|
||||
// Upgrades during shutdown don't work, even if upgraded_handle exists.
|
||||
weak_handle
|
||||
.upgrade()
|
||||
.err()
|
||||
.expect("can't upgrade weak handle as soon as shutdown started");
|
||||
|
||||
// But upgraded_handle is still alive, so the gate won't close.
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => {
|
||||
panic!("handle is keeping gate open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => { }
|
||||
}
|
||||
|
||||
// Drop the last handle.
|
||||
drop(upgraded_handle);
|
||||
|
||||
// The gate should close now, despite there still being a weak_handle.
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("only strong handle is dropped and we shut down per-timeline-state")
|
||||
}
|
||||
}
|
||||
|
||||
// The weak handle still can't be upgraded.
|
||||
weak_handle
|
||||
.upgrade()
|
||||
.err()
|
||||
.expect("still shouldn't be able to upgrade the weak handle");
|
||||
|
||||
// There should be no strong references to the timeline object except the one on "stack".
|
||||
assert_eq!(Arc::strong_count(&shard0), refcount_start);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let (replication_client, connection) = {
|
||||
let mut config = wal_source_connconf.to_tokio_postgres_config();
|
||||
config.application_name(format!("pageserver-{}", node.0).as_str());
|
||||
config.application_name("pageserver");
|
||||
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
|
||||
Ok(client_and_conn) => client_and_conn?,
|
||||
@@ -264,8 +264,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
let interpreted_proto_config = match protocol {
|
||||
PostgresClientProtocol::Vanilla => None,
|
||||
PostgresClientProtocol::Interpreted {
|
||||
@@ -478,12 +476,10 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&shard,
|
||||
modification.tline.get_shard_identity(),
|
||||
next_record_lsn,
|
||||
modification.tline.pg_version,
|
||||
)?
|
||||
.remove(timeline.get_shard_identity())
|
||||
.unwrap();
|
||||
)?;
|
||||
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2163,12 +2163,10 @@ mod tests {
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&[*modification.tline.get_shard_identity()],
|
||||
modification.tline.get_shard_identity(),
|
||||
lsn,
|
||||
modification.tline.pg_version,
|
||||
)
|
||||
.unwrap()
|
||||
.remove(modification.tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
|
||||
@@ -911,74 +911,7 @@ pageserver_receive(shardno_t shard_no)
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
|
||||
shard->nresponses_received++;
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
|
||||
static NeonResponse *
|
||||
pageserver_try_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
if (shard->state != PS_Connected)
|
||||
return NULL;
|
||||
|
||||
Assert(pageserver_conn);
|
||||
|
||||
rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */);
|
||||
|
||||
if (rc == 0)
|
||||
return NULL;
|
||||
else if (rc > 0)
|
||||
{
|
||||
PG_TRY();
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
@@ -1047,7 +980,6 @@ page_server_api api =
|
||||
.send = pageserver_send,
|
||||
.flush = pageserver_flush,
|
||||
.receive = pageserver_receive,
|
||||
.try_receive = pageserver_try_receive,
|
||||
.disconnect = pageserver_disconnect_shard
|
||||
};
|
||||
|
||||
|
||||
@@ -34,8 +34,6 @@ typedef enum
|
||||
T_NeonGetPageRequest,
|
||||
T_NeonDbSizeRequest,
|
||||
T_NeonGetSlruSegmentRequest,
|
||||
/* future tags above this line */
|
||||
T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */
|
||||
|
||||
/* pagestore -> pagestore_client */
|
||||
T_NeonExistsResponse = 100,
|
||||
@@ -44,8 +42,6 @@ typedef enum
|
||||
T_NeonErrorResponse,
|
||||
T_NeonDbSizeResponse,
|
||||
T_NeonGetSlruSegmentResponse,
|
||||
/* future tags above this line */
|
||||
T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */
|
||||
} NeonMessageTag;
|
||||
|
||||
typedef uint64 NeonRequestId;
|
||||
@@ -196,29 +192,9 @@ typedef uint16 shardno_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/*
|
||||
* Send this request to the PageServer associated with this shard.
|
||||
*/
|
||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||
/*
|
||||
* Blocking read for the next response of this shard.
|
||||
*
|
||||
* When a CANCEL signal is handled, the connection state will be
|
||||
* unmodified.
|
||||
*/
|
||||
NeonResponse *(*receive) (shardno_t shard_no);
|
||||
/*
|
||||
* Try get the next response from the TCP buffers, if any.
|
||||
* Returns NULL when the data is not yet available.
|
||||
*/
|
||||
NeonResponse *(*try_receive) (shardno_t shard_no);
|
||||
/*
|
||||
* Make sure all requests are sent to PageServer.
|
||||
*/
|
||||
bool (*flush) (shardno_t shard_no);
|
||||
/*
|
||||
* Disconnect from this pageserver shard.
|
||||
*/
|
||||
void (*disconnect) (shardno_t shard_no);
|
||||
} page_server_api;
|
||||
|
||||
|
||||
@@ -405,56 +405,6 @@ compact_prefetch_buffers(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there might be responses still in the TCP buffer, then
|
||||
* we should try to use those, so as to reduce any TCP backpressure
|
||||
* on the OS/PS side.
|
||||
*
|
||||
* This procedure handles that.
|
||||
*
|
||||
* Note that this is only valid as long as the only pipelined
|
||||
* operations in the TCP buffer are getPage@Lsn requests.
|
||||
*/
|
||||
static void
|
||||
prefetch_pump_state(void)
|
||||
{
|
||||
while (MyPState->ring_receive != MyPState->ring_flush)
|
||||
{
|
||||
NeonResponse *response;
|
||||
PrefetchRequest *slot;
|
||||
MemoryContext old;
|
||||
|
||||
slot = GetPrfSlot(MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = page_server->try_receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
|
||||
if (response == NULL)
|
||||
break;
|
||||
|
||||
/* The slot should still be valid */
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
MyPState->ring_receive += 1;
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_RECEIVED;
|
||||
slot->response = response;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
readahead_buffer_resize(int newsize, void *extra)
|
||||
{
|
||||
@@ -2858,8 +2808,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
MyPState->ring_last <= ring_index);
|
||||
}
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2901,8 +2849,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
Assert(ring_index < MyPState->ring_unused &&
|
||||
MyPState->ring_last <= ring_index);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
#endif /* PG_MAJORVERSION_NUM < 17 */
|
||||
@@ -2945,8 +2891,6 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
*/
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdwriteback(reln, forknum, blocknum, nblocks);
|
||||
@@ -3201,8 +3145,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -3340,8 +3282,6 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
|
||||
buffers, nblocks, read);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -3510,8 +3450,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
@@ -3565,8 +3503,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
|
||||
@@ -3856,8 +3792,6 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdimmedsync(reln, forknum);
|
||||
|
||||
@@ -187,6 +187,10 @@ pub async fn worker(
|
||||
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
|
||||
let rx = rx.map(RequestData::from);
|
||||
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?;
|
||||
|
||||
let properties = WriterProperties::builder()
|
||||
.set_data_page_size_limit(config.parquet_upload_page_size)
|
||||
.set_compression(config.parquet_upload_compression);
|
||||
@@ -220,18 +224,18 @@ pub async fn worker(
|
||||
let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx));
|
||||
let rx_disconnect = rx_disconnect.map(RequestData::from);
|
||||
|
||||
let storage_disconnect =
|
||||
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
|
||||
.await
|
||||
.context("remote storage for disconnect events init")?;
|
||||
let parquet_config_disconnect = parquet_config.clone();
|
||||
tokio::try_join!(
|
||||
worker_inner(remote_storage_config, rx, parquet_config),
|
||||
worker_inner(
|
||||
disconnect_events_storage_config,
|
||||
rx_disconnect,
|
||||
parquet_config_disconnect
|
||||
)
|
||||
worker_inner(storage, rx, parquet_config),
|
||||
worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
|
||||
)
|
||||
.map(|_| ())
|
||||
} else {
|
||||
worker_inner(remote_storage_config, rx, parquet_config).await
|
||||
worker_inner(storage, rx, parquet_config).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -247,32 +251,18 @@ struct ParquetConfig {
|
||||
test_remote_failures: u64,
|
||||
}
|
||||
|
||||
impl ParquetConfig {
|
||||
async fn storage(
|
||||
&self,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let storage = GenericRemoteStorage::from_config(storage_config)
|
||||
.await
|
||||
.context("remote storage init")?;
|
||||
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
if self.test_remote_failures > 0 {
|
||||
return Ok(GenericRemoteStorage::unreliable_wrapper(
|
||||
storage,
|
||||
self.test_remote_failures,
|
||||
));
|
||||
}
|
||||
|
||||
Ok(storage)
|
||||
}
|
||||
}
|
||||
|
||||
async fn worker_inner(
|
||||
storage_config: RemoteStorageConfig,
|
||||
storage: GenericRemoteStorage,
|
||||
rx: impl Stream<Item = RequestData>,
|
||||
config: ParquetConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
let storage = if config.test_remote_failures > 0 {
|
||||
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
|
||||
} else {
|
||||
storage
|
||||
};
|
||||
|
||||
let mut rx = std::pin::pin!(rx);
|
||||
|
||||
let mut rows = Vec::with_capacity(config.rows_per_group);
|
||||
@@ -295,7 +285,7 @@ async fn worker_inner(
|
||||
}
|
||||
if len > config.file_size || force {
|
||||
last_upload = time::Instant::now();
|
||||
let file = upload_parquet(w, len, &storage_config, &config).await?;
|
||||
let file = upload_parquet(w, len, &storage).await?;
|
||||
w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
|
||||
len = 0;
|
||||
}
|
||||
@@ -308,7 +298,7 @@ async fn worker_inner(
|
||||
}
|
||||
|
||||
if !w.flushed_row_groups().is_empty() {
|
||||
let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage_config, &config).await?;
|
||||
let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -350,8 +340,7 @@ where
|
||||
async fn upload_parquet(
|
||||
mut w: SerializedFileWriter<Writer<BytesMut>>,
|
||||
len: i64,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
config: &ParquetConfig,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<Writer<BytesMut>> {
|
||||
let len_uncompressed = w
|
||||
.flushed_row_groups()
|
||||
@@ -388,15 +377,6 @@ async fn upload_parquet(
|
||||
size, compression, "uploading request parquet file"
|
||||
);
|
||||
|
||||
// A bug in azure-sdk means that the identity-token-file that expires after
|
||||
// 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage
|
||||
// tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh
|
||||
// the storage token, but the identity token has now expired.
|
||||
// <https://github.com/Azure/azure-sdk-for-rust/issues/1739>
|
||||
//
|
||||
// To work around this, we recreate the storage every time.
|
||||
let storage = config.storage(storage_config).await?;
|
||||
|
||||
let year = now.year();
|
||||
let month = now.month();
|
||||
let day = now.day();
|
||||
@@ -451,8 +431,8 @@ mod tests {
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use remote_storage::{
|
||||
RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::time;
|
||||
@@ -579,11 +559,12 @@ mod tests {
|
||||
timeout: std::time::Duration::from_secs(120),
|
||||
small_timeout: std::time::Duration::from_secs(30),
|
||||
};
|
||||
|
||||
worker_inner(remote_storage_config, rx, config)
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
worker_inner(storage, rx, config).await.unwrap();
|
||||
|
||||
let mut files = WalkDir::new(tmpdir.as_std_path())
|
||||
.into_iter()
|
||||
.filter_map(|entry| entry.ok())
|
||||
|
||||
@@ -26,7 +26,6 @@ hex.workspace = true
|
||||
humantime.workspace = true
|
||||
http.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
futures.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
@@ -40,7 +39,6 @@ scopeguard.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
smallvec.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
@@ -65,7 +63,6 @@ storage_broker.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
utils.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -207,13 +207,6 @@ struct Args {
|
||||
/// Also defines interval for eviction retries.
|
||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
|
||||
eviction_min_resident: Duration,
|
||||
/// Enable fanning out WAL to different shards from the same reader
|
||||
#[arg(long)]
|
||||
wal_reader_fanout: bool,
|
||||
/// Only fan out the WAL reader if the absolute delta between the new requested position
|
||||
/// and the current position of the reader is smaller than this value.
|
||||
#[arg(long)]
|
||||
max_delta_for_fanout: Option<u64>,
|
||||
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
@@ -377,8 +370,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
control_file_save_interval: args.control_file_save_interval,
|
||||
partial_backup_concurrency: args.partial_backup_concurrency,
|
||||
eviction_min_resident: args.eviction_min_resident,
|
||||
wal_reader_fanout: args.wal_reader_fanout,
|
||||
max_delta_for_fanout: args.max_delta_for_fanout,
|
||||
});
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use safekeeper_api::membership::INVALID_GENERATION;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use utils::crashsafe::durable_rename;
|
||||
@@ -14,14 +13,14 @@ use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::control_file_upgrade::downgrade_v10_to_v9;
|
||||
use crate::control_file_upgrade::downgrade_v9_to_v8;
|
||||
use crate::control_file_upgrade::upgrade_control_file;
|
||||
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use utils::bin_ser::LeSer;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 10;
|
||||
pub const SK_FORMAT_VERSION: u32 = 9;
|
||||
|
||||
// contains persistent metadata for safekeeper
|
||||
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
@@ -170,11 +169,10 @@ impl TimelinePersistentState {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
|
||||
|
||||
if self.mconf.generation == INVALID_GENERATION {
|
||||
// Temp hack for forward compatibility test: in case of none
|
||||
// configuration save cfile in previous v9 format.
|
||||
const PREV_FORMAT_VERSION: u32 = 9;
|
||||
let prev = downgrade_v10_to_v9(self);
|
||||
if self.eviction_state == EvictionState::Present {
|
||||
// temp hack for forward compatibility
|
||||
const PREV_FORMAT_VERSION: u32 = 8;
|
||||
let prev = downgrade_v9_to_v8(self);
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
|
||||
prev.ser_into(&mut buf)?;
|
||||
} else {
|
||||
@@ -235,7 +233,6 @@ impl Storage for FileStorage {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use safekeeper_api::membership::{Configuration, MemberSet};
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -245,11 +242,6 @@ mod test {
|
||||
async fn test_read_write_safekeeper_state() -> anyhow::Result<()> {
|
||||
let tempdir = camino_tempfile::tempdir()?;
|
||||
let mut state = TimelinePersistentState::empty();
|
||||
state.mconf = Configuration {
|
||||
generation: 42,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
};
|
||||
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;
|
||||
|
||||
// Make a change.
|
||||
|
||||
@@ -1,22 +1,17 @@
|
||||
//! Code to deal with safekeeper control file upgrades
|
||||
use std::vec;
|
||||
|
||||
use crate::{
|
||||
safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn},
|
||||
state::{EvictionState, TimelinePersistentState},
|
||||
state::{EvictionState, PersistedPeers, TimelinePersistentState},
|
||||
wal_backup_partial,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
use pq_proto::SystemId;
|
||||
use safekeeper_api::{
|
||||
membership::{Configuration, INVALID_GENERATION},
|
||||
ServerInfo, Term,
|
||||
};
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::LeSer,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
@@ -238,90 +233,6 @@ pub struct SafeKeeperStateV8 {
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeerInfo {
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Term of the last entry.
|
||||
pub term: Term,
|
||||
/// LSN of the last record.
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
pub commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PersistedPeerInfo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
backup_lsn: Lsn::INVALID,
|
||||
term: safekeeper_api::INITIAL_TERM,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// make clippy happy
|
||||
impl Default for PersistedPeerInfo {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct TimelinePersistentStateV9 {
|
||||
#[serde(with = "hex")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// Since which LSN this timeline generally starts. Safekeeper might have
|
||||
/// joined later.
|
||||
pub timeline_start_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has (had) WAL for this timeline.
|
||||
/// All WAL segments next to one containing local_start_lsn are
|
||||
/// filled with data from the beginning.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// Part of WAL acknowledged by quorum *and available locally*. Always points
|
||||
/// to record boundary.
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN that points to the end of the last backed up segment. Useful to
|
||||
/// persist to avoid finding out offloading progress on boot.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
|
||||
/// only by walproposer.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
||||
// migrate to storing full term history
|
||||
if version == 1 {
|
||||
@@ -337,7 +248,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: ac,
|
||||
server: ServerInfo {
|
||||
pg_version: oldstate.server.pg_version,
|
||||
@@ -351,9 +261,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to hexing some ids
|
||||
} else if version == 2 {
|
||||
@@ -367,7 +277,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -377,9 +286,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||
} else if version == 3 {
|
||||
@@ -393,7 +302,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -403,9 +311,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to having timeline_start_lsn
|
||||
} else if version == 4 {
|
||||
@@ -419,7 +327,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -429,9 +336,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn::INVALID,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 5 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
@@ -465,7 +372,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -475,9 +381,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 8 {
|
||||
let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
|
||||
@@ -485,7 +391,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -495,28 +400,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 9 {
|
||||
let oldstate = TimelinePersistentStateV9::des(&buf[..buf.len()])?;
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
timeline_start_lsn: oldstate.timeline_start_lsn,
|
||||
local_start_lsn: oldstate.local_start_lsn,
|
||||
commit_lsn: oldstate.commit_lsn,
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: oldstate.eviction_state,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -526,11 +412,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
bail!("unsupported safekeeper control file version {}", version)
|
||||
}
|
||||
|
||||
// Used as a temp hack to make forward compatibility test work. Should be
|
||||
// removed after PR adding v10 is merged.
|
||||
pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 {
|
||||
assert!(state.mconf.generation == INVALID_GENERATION);
|
||||
TimelinePersistentStateV9 {
|
||||
pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
|
||||
assert!(state.eviction_state == EvictionState::Present);
|
||||
SafeKeeperStateV8 {
|
||||
tenant_id: state.tenant_id,
|
||||
timeline_id: state.timeline_id,
|
||||
acceptor_state: state.acceptor_state.clone(),
|
||||
@@ -542,9 +426,8 @@ pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersisten
|
||||
backup_lsn: state.backup_lsn,
|
||||
peer_horizon_lsn: state.peer_horizon_lsn,
|
||||
remote_consistent_lsn: state.remote_consistent_lsn,
|
||||
peers: PersistedPeers(vec![]),
|
||||
peers: state.peers.clone(),
|
||||
partial_backup: state.partial_backup.clone(),
|
||||
eviction_state: state.eviction_state,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -554,7 +437,7 @@ mod tests {
|
||||
|
||||
use utils::{id::NodeId, Hex};
|
||||
|
||||
use crate::control_file_upgrade::PersistedPeerInfo;
|
||||
use crate::safekeeper::PersistedPeerInfo;
|
||||
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use anyhow::{bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use std::sync::Arc;
|
||||
use tokio::{
|
||||
fs::OpenOptions,
|
||||
@@ -148,10 +147,10 @@ pub async fn handle_request(
|
||||
|
||||
let mut new_state = TimelinePersistentState::new(
|
||||
&request.destination_ttid,
|
||||
Configuration::empty(),
|
||||
state.server.clone(),
|
||||
start_lsn,
|
||||
vec![],
|
||||
request.until_lsn,
|
||||
start_lsn,
|
||||
)?;
|
||||
new_state.timeline_start_lsn = start_lsn;
|
||||
new_state.peer_horizon_lsn = request.until_lsn;
|
||||
|
||||
@@ -52,70 +52,16 @@ pub struct SafekeeperPostgresHandler {
|
||||
|
||||
/// Parsed Postgres command.
|
||||
enum SafekeeperPostgresCommand {
|
||||
StartWalPush {
|
||||
proto_version: u32,
|
||||
// Eventually timelines will be always created explicitly by storcon.
|
||||
// This option allows legacy behaviour for compute to do that until we
|
||||
// fully migrate.
|
||||
allow_timeline_creation: bool,
|
||||
},
|
||||
StartReplication {
|
||||
start_lsn: Lsn,
|
||||
term: Option<Term>,
|
||||
},
|
||||
StartWalPush,
|
||||
StartReplication { start_lsn: Lsn, term: Option<Term> },
|
||||
IdentifySystem,
|
||||
TimelineStatus,
|
||||
JSONCtrl {
|
||||
cmd: AppendLogicalMessage,
|
||||
},
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
}
|
||||
|
||||
fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
if cmd.starts_with("START_WAL_PUSH") {
|
||||
// Allow additional options in postgres START_REPLICATION style like
|
||||
// START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false').
|
||||
// Parsing here is very naive and breaks in case of commas or
|
||||
// whitespaces in values, but enough for our purposes.
|
||||
let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap();
|
||||
let caps = re
|
||||
.captures(cmd)
|
||||
.context(format!("failed to parse START_WAL_PUSH command {}", cmd))?;
|
||||
// capture () content
|
||||
let options = caps.get(2).map(|m| m.as_str()).unwrap_or("");
|
||||
// default values
|
||||
let mut proto_version = 2;
|
||||
let mut allow_timeline_creation = true;
|
||||
for kvstr in options.split(",") {
|
||||
if kvstr.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let mut kvit = kvstr.split_whitespace();
|
||||
let key = kvit.next().context(format!(
|
||||
"failed to parse key in kv {} in command {}",
|
||||
kvstr, cmd
|
||||
))?;
|
||||
let value = kvit.next().context(format!(
|
||||
"failed to parse value in kv {} in command {}",
|
||||
kvstr, cmd
|
||||
))?;
|
||||
let value_trimmed = value.trim_matches('\'');
|
||||
if key == "proto_version" {
|
||||
proto_version = value_trimmed.parse::<u32>().context(format!(
|
||||
"failed to parse proto_version value {} in command {}",
|
||||
value, cmd
|
||||
))?;
|
||||
}
|
||||
if key == "allow_timeline_creation" {
|
||||
allow_timeline_creation = value_trimmed.parse::<bool>().context(format!(
|
||||
"failed to parse allow_timeline_creation value {} in command {}",
|
||||
value, cmd
|
||||
))?;
|
||||
}
|
||||
}
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
})
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush)
|
||||
} else if cmd.starts_with("START_REPLICATION") {
|
||||
let re = Regex::new(
|
||||
// We follow postgres START_REPLICATION LOGICAL options to pass term.
|
||||
@@ -149,7 +95,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
|
||||
fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
|
||||
SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
|
||||
SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
|
||||
@@ -347,11 +293,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation)
|
||||
SafekeeperPostgresCommand::StartWalPush => {
|
||||
self.handle_start_wal_push(pgb)
|
||||
.instrument(info_span!("WAL receiver"))
|
||||
.await
|
||||
}
|
||||
@@ -524,39 +467,3 @@ impl SafekeeperPostgresHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SafekeeperPostgresCommand;
|
||||
|
||||
/// Test parsing of START_WAL_PUSH command
|
||||
#[test]
|
||||
fn test_start_wal_push_parse() {
|
||||
let cmd = "START_WAL_PUSH";
|
||||
let parsed = super::parse_cmd(cmd).expect("failed to parse");
|
||||
match parsed {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
assert_eq!(proto_version, 2);
|
||||
assert!(allow_timeline_creation);
|
||||
}
|
||||
_ => panic!("unexpected command"),
|
||||
}
|
||||
|
||||
let cmd =
|
||||
"START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')";
|
||||
let parsed = super::parse_cmd(cmd).expect("failed to parse");
|
||||
match parsed {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
assert_eq!(proto_version, 3);
|
||||
assert!(!allow_timeline_creation);
|
||||
}
|
||||
_ => panic!("unexpected command"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use safekeeper_api::models;
|
||||
use safekeeper_api::models::AcceptorStateStatus;
|
||||
use safekeeper_api::models::SafekeeperStatus;
|
||||
use safekeeper_api::models::TermSwitchApiEntry;
|
||||
@@ -112,15 +111,14 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
system_id: request_data.system_id.unwrap_or(0),
|
||||
wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
|
||||
};
|
||||
let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
|
||||
request_data
|
||||
.commit_lsn
|
||||
.segment_lsn(server_info.wal_seg_size as usize)
|
||||
});
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
global_timelines
|
||||
.create(
|
||||
ttid,
|
||||
request_data.mconf,
|
||||
server_info,
|
||||
request_data.start_lsn,
|
||||
request_data.commit_lsn.unwrap_or(request_data.start_lsn),
|
||||
)
|
||||
.create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -184,7 +182,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let status = TimelineStatus {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
mconf: state.mconf,
|
||||
acceptor_state: acc_state,
|
||||
pg_info: state.server,
|
||||
flush_lsn,
|
||||
@@ -195,7 +192,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: inmem.remote_consistent_lsn,
|
||||
peers: tli.get_peers(conf).await,
|
||||
walsenders: tli.get_walsenders().get_all_public(),
|
||||
walsenders: tli.get_walsenders().get_all(),
|
||||
walreceivers: tli.get_walreceivers().get_all(),
|
||||
};
|
||||
json_response(StatusCode::OK, status)
|
||||
@@ -270,28 +267,6 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Consider switching timeline membership configuration to the provided one.
|
||||
async fn timeline_membership_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?;
|
||||
let response = tli
|
||||
.membership_switch(data.mconf)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
@@ -643,10 +618,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
|
||||
|r| request_span(r, timeline_snapshot_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/membership",
|
||||
|r| request_span(r, timeline_membership_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|
||||
|r| request_span(r, timeline_copy_handler),
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
use anyhow::Context;
|
||||
use postgres_backend::QueryError;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -106,7 +105,6 @@ async fn prepare_safekeeper(
|
||||
.global_timelines
|
||||
.create(
|
||||
spg.ttid,
|
||||
Configuration::empty(),
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
|
||||
@@ -108,8 +108,6 @@ pub struct SafeKeeperConf {
|
||||
pub control_file_save_interval: Duration,
|
||||
pub partial_backup_concurrency: usize,
|
||||
pub eviction_min_resident: Duration,
|
||||
pub wal_reader_fanout: bool,
|
||||
pub max_delta_for_fanout: Option<u64>,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -152,8 +150,6 @@ impl SafeKeeperConf {
|
||||
control_file_save_interval: Duration::from_secs(1),
|
||||
partial_backup_concurrency: 1,
|
||||
eviction_min_resident: Duration::ZERO,
|
||||
wal_reader_fanout: false,
|
||||
max_delta_for_fanout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ use metrics::{
|
||||
pow2_buckets,
|
||||
proto::MetricFamily,
|
||||
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
|
||||
register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec,
|
||||
Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
|
||||
IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_ffi::XLogSegNo;
|
||||
@@ -211,14 +211,6 @@ pub static WAL_RECEIVERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_receivers")
|
||||
});
|
||||
pub static WAL_READERS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"safekeeper_wal_readers",
|
||||
"Number of active WAL readers (may serve pageservers or other safekeepers)",
|
||||
&["kind", "target"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_receivers")
|
||||
});
|
||||
pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
|
||||
// Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and
|
||||
// full queues respectively.
|
||||
@@ -451,7 +443,6 @@ pub struct FullTimelineInfo {
|
||||
pub timeline_is_active: bool,
|
||||
pub num_computes: u32,
|
||||
pub last_removed_segno: XLogSegNo,
|
||||
pub interpreted_wal_reader_tasks: usize,
|
||||
|
||||
pub epoch_start_lsn: Lsn,
|
||||
pub mem_state: TimelineMemState,
|
||||
@@ -481,7 +472,6 @@ pub struct TimelineCollector {
|
||||
disk_usage: GenericGaugeVec<AtomicU64>,
|
||||
acceptor_term: GenericGaugeVec<AtomicU64>,
|
||||
written_wal_bytes: GenericGaugeVec<AtomicU64>,
|
||||
interpreted_wal_reader_tasks: GenericGaugeVec<AtomicU64>,
|
||||
written_wal_seconds: GaugeVec,
|
||||
flushed_wal_seconds: GaugeVec,
|
||||
collect_timeline_metrics: Gauge,
|
||||
@@ -680,16 +670,6 @@ impl TimelineCollector {
|
||||
.unwrap();
|
||||
descs.extend(active_timelines_count.desc().into_iter().cloned());
|
||||
|
||||
let interpreted_wal_reader_tasks = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_interpreted_wal_reader_tasks",
|
||||
"Number of active interpreted wal reader tasks, grouped by timeline",
|
||||
),
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned());
|
||||
|
||||
TimelineCollector {
|
||||
global_timelines,
|
||||
descs,
|
||||
@@ -713,7 +693,6 @@ impl TimelineCollector {
|
||||
collect_timeline_metrics,
|
||||
timelines_count,
|
||||
active_timelines_count,
|
||||
interpreted_wal_reader_tasks,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -742,7 +721,6 @@ impl Collector for TimelineCollector {
|
||||
self.disk_usage.reset();
|
||||
self.acceptor_term.reset();
|
||||
self.written_wal_bytes.reset();
|
||||
self.interpreted_wal_reader_tasks.reset();
|
||||
self.written_wal_seconds.reset();
|
||||
self.flushed_wal_seconds.reset();
|
||||
|
||||
@@ -804,9 +782,6 @@ impl Collector for TimelineCollector {
|
||||
self.written_wal_bytes
|
||||
.with_label_values(labels)
|
||||
.set(tli.wal_storage.write_wal_bytes);
|
||||
self.interpreted_wal_reader_tasks
|
||||
.with_label_values(labels)
|
||||
.set(tli.interpreted_wal_reader_tasks as u64);
|
||||
self.written_wal_seconds
|
||||
.with_label_values(labels)
|
||||
.set(tli.wal_storage.write_wal_seconds);
|
||||
@@ -859,7 +834,6 @@ impl Collector for TimelineCollector {
|
||||
mfs.extend(self.disk_usage.collect());
|
||||
mfs.extend(self.acceptor_term.collect());
|
||||
mfs.extend(self.written_wal_bytes.collect());
|
||||
mfs.extend(self.interpreted_wal_reader_tasks.collect());
|
||||
mfs.extend(self.written_wal_seconds.collect());
|
||||
mfs.extend(self.flushed_wal_seconds.collect());
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::PostgresBackendReader;
|
||||
use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use std::future;
|
||||
@@ -200,14 +199,9 @@ impl SafekeeperPostgresHandler {
|
||||
pub async fn handle_start_wal_push<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
proto_version: u32,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(), QueryError> {
|
||||
let mut tli: Option<WalResidentTimeline> = None;
|
||||
if let Err(end) = self
|
||||
.handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation)
|
||||
.await
|
||||
{
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
let handle_end_fut = pgb.handle_copy_stream_end(end);
|
||||
// If we managed to create the timeline, augment logging with current LSNs etc.
|
||||
@@ -227,8 +221,6 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tli: &mut Option<WalResidentTimeline>,
|
||||
proto_version: u32,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// The `tli` parameter is only used for passing _out_ a timeline, one should
|
||||
// not have been passed in.
|
||||
@@ -257,17 +249,12 @@ impl SafekeeperPostgresHandler {
|
||||
conn_id: self.conn_id,
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
proto_version,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
global_timelines: self.global_timelines.clone(),
|
||||
};
|
||||
|
||||
// Read first message and create timeline if needed and allowed. This
|
||||
// won't be when timelines will be always created by storcon and
|
||||
// allow_timeline_creation becomes false.
|
||||
let res = network_reader
|
||||
.read_first_message(allow_timeline_creation)
|
||||
.await;
|
||||
// Read first message and create timeline if needed.
|
||||
let res = network_reader.read_first_message().await;
|
||||
|
||||
let network_res = if let Ok((timeline, next_msg)) = res {
|
||||
let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
|
||||
@@ -325,7 +312,6 @@ struct NetworkReader<'a, IO> {
|
||||
conn_id: ConnectionId,
|
||||
pgb_reader: &'a mut PostgresBackendReader<IO>,
|
||||
peer_addr: SocketAddr,
|
||||
proto_version: u32,
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
@@ -335,10 +321,9 @@ struct NetworkReader<'a, IO> {
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
async fn read_first_message(
|
||||
&mut self,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader, self.proto_version).await?;
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let tli = match next_msg {
|
||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
||||
info!(
|
||||
@@ -350,22 +335,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
let tli = if allow_timeline_creation {
|
||||
self.global_timelines
|
||||
.create(
|
||||
self.ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await
|
||||
.context("create timeline")?
|
||||
} else {
|
||||
self.global_timelines
|
||||
.get(self.ttid)
|
||||
.context("get timeline")?
|
||||
};
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
_ => {
|
||||
@@ -394,7 +368,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
));
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -402,10 +376,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
/// TODO: Return Ok(None) on graceful termination.
|
||||
async fn read_message<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_reader: &mut PostgresBackendReader<IO>,
|
||||
proto_version: u32,
|
||||
) -> Result<ProposerAcceptorMessage, CopyStreamHandlerEnd> {
|
||||
let copy_data = pgb_reader.read_copy_message().await?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
@@ -413,7 +386,6 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_reader: &mut PostgresBackendReader<IO>,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
mut next_msg: ProposerAcceptorMessage,
|
||||
proto_version: u32,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
/// Threshold for logging slow WalAcceptor sends.
|
||||
const SLOW_THRESHOLD: Duration = Duration::from_secs(5);
|
||||
@@ -446,7 +418,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc();
|
||||
WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64);
|
||||
|
||||
next_msg = read_message(pgb_reader, proto_version).await?;
|
||||
next_msg = read_message(pgb_reader).await?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
|
||||
use safekeeper_api::models::HotStandbyFeedback;
|
||||
use safekeeper_api::Term;
|
||||
use safekeeper_api::INVALID_TERM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
@@ -29,7 +30,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub const SK_PROTOCOL_VERSION: u32 = 2;
|
||||
const SK_PROTOCOL_VERSION: u32 = 2;
|
||||
pub const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -192,6 +193,36 @@ impl AcceptorState {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeerInfo {
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Term of the last entry.
|
||||
pub term: Term,
|
||||
/// LSN of the last record.
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
pub commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PersistedPeerInfo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
backup_lsn: Lsn::INVALID,
|
||||
term: INVALID_TERM,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// make clippy happy
|
||||
impl Default for PersistedPeerInfo {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// protocol messages
|
||||
|
||||
/// Initial Proposer -> Acceptor message
|
||||
@@ -317,14 +348,7 @@ pub enum ProposerAcceptorMessage {
|
||||
|
||||
impl ProposerAcceptorMessage {
|
||||
/// Parse proposer message.
|
||||
pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
|
||||
if proto_version != SK_PROTOCOL_VERSION {
|
||||
bail!(
|
||||
"incompatible protocol version {}, expected {}",
|
||||
proto_version,
|
||||
SK_PROTOCOL_VERSION
|
||||
);
|
||||
}
|
||||
pub fn parse(msg_bytes: Bytes) -> Result<ProposerAcceptorMessage> {
|
||||
// xxx using Reader is inefficient but easy to work with bincode
|
||||
let mut stream = msg_bytes.reader();
|
||||
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
|
||||
@@ -986,7 +1010,7 @@ where
|
||||
|
||||
/// Update commit_lsn from peer safekeeper data.
|
||||
pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
|
||||
if Lsn(sk_info.commit_lsn) != Lsn::INVALID {
|
||||
if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
|
||||
// Note: the check is too restrictive, generally we can update local
|
||||
// commit_lsn if our history matches (is part of) history of advanced
|
||||
// commit_lsn provider.
|
||||
@@ -1001,20 +1025,12 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::future::BoxFuture;
|
||||
|
||||
use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
|
||||
use safekeeper_api::{
|
||||
membership::{Configuration, MemberSet, SafekeeperId},
|
||||
ServerInfo,
|
||||
};
|
||||
use safekeeper_api::ServerInfo;
|
||||
|
||||
use super::*;
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use std::{
|
||||
ops::Deref,
|
||||
str::FromStr,
|
||||
time::{Instant, UNIX_EPOCH},
|
||||
};
|
||||
use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState};
|
||||
use std::{ops::Deref, str::FromStr, time::Instant};
|
||||
|
||||
// fake storage for tests
|
||||
struct InMemoryState {
|
||||
@@ -1297,21 +1313,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_sk_state_bincode_serde_roundtrip() {
|
||||
use utils::Hex;
|
||||
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
|
||||
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
|
||||
let state = TimelinePersistentState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mconf: Configuration {
|
||||
generation: 42,
|
||||
members: MemberSet::new(vec![SafekeeperId {
|
||||
id: NodeId(1),
|
||||
host: "hehe.org".to_owned(),
|
||||
pg_port: 5432,
|
||||
}])
|
||||
.expect("duplicate member"),
|
||||
new_members: None,
|
||||
},
|
||||
acceptor_state: AcceptorState {
|
||||
term: 42,
|
||||
term_history: TermHistory(vec![TermLsn {
|
||||
@@ -1335,13 +1342,70 @@ mod tests {
|
||||
backup_lsn: Lsn(1234567300),
|
||||
peer_horizon_lsn: Lsn(9999999),
|
||||
remote_consistent_lsn: Lsn(1234560000),
|
||||
peers: PersistedPeers(vec![(
|
||||
NodeId(1),
|
||||
PersistedPeerInfo {
|
||||
backup_lsn: Lsn(1234567000),
|
||||
term: 42,
|
||||
flush_lsn: Lsn(1234567800 - 8),
|
||||
commit_lsn: Lsn(1234567600),
|
||||
},
|
||||
)]),
|
||||
partial_backup: crate::wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: UNIX_EPOCH,
|
||||
};
|
||||
|
||||
let ser = state.ser().unwrap();
|
||||
|
||||
#[rustfmt::skip]
|
||||
let expected = [
|
||||
// tenant_id as length prefixed hex
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36,
|
||||
// timeline_id as length prefixed hex
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34,
|
||||
// term
|
||||
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// length prefix
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// unsure why this order is swapped
|
||||
0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// pg_version
|
||||
0x0e, 0x00, 0x00, 0x00,
|
||||
// systemid
|
||||
0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
|
||||
// wal_seg_size
|
||||
0x78, 0x56, 0x34, 0x12,
|
||||
// pguuid as length prefixed hex
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
|
||||
|
||||
// timeline_start_lsn
|
||||
0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00,
|
||||
0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
// length prefix for persistentpeers
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// nodeid
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// backuplsn
|
||||
0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
// partial_backup
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// eviction_state
|
||||
0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
assert_eq!(Hex(&ser), Hex(&expected));
|
||||
|
||||
let deser = TimelinePersistentState::des(&ser).unwrap();
|
||||
|
||||
assert_eq!(deser, state);
|
||||
|
||||
@@ -1,330 +1,96 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::future::Either;
|
||||
use anyhow::Context;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend};
|
||||
use postgres_ffi::waldecoder::WalDecodeError;
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder};
|
||||
use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::sync::mpsc::error::SendError;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::MissedTickBehavior;
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::Compression;
|
||||
use utils::postgres_client::InterpretedFormat;
|
||||
use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords};
|
||||
use wal_decoder::wire_format::ToWireFormat;
|
||||
|
||||
use crate::metrics::WAL_READERS;
|
||||
use crate::send_wal::{EndWatchView, WalSenderGuard};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::{StreamingWalReader, WalBytes};
|
||||
use crate::send_wal::EndWatchView;
|
||||
use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder};
|
||||
|
||||
/// Identifier used to differentiate between senders of the same
|
||||
/// shard.
|
||||
///
|
||||
/// In the steady state there's only one, but two pageservers may
|
||||
/// temporarily have the same shard attached and attempt to ingest
|
||||
/// WAL for it. See also [`ShardSenderId`].
|
||||
#[derive(Hash, Eq, PartialEq, Copy, Clone)]
|
||||
struct SenderId(u8);
|
||||
|
||||
impl SenderId {
|
||||
fn first() -> Self {
|
||||
SenderId(0)
|
||||
}
|
||||
|
||||
fn next(&self) -> Self {
|
||||
SenderId(self.0.checked_add(1).expect("few senders"))
|
||||
}
|
||||
/// Shard-aware interpreted record sender.
|
||||
/// This is used for sending WAL to the pageserver. Said WAL
|
||||
/// is pre-interpreted and filtered for the shard.
|
||||
pub(crate) struct InterpretedWalSender<'a, IO> {
|
||||
pub(crate) format: InterpretedFormat,
|
||||
pub(crate) compression: Option<Compression>,
|
||||
pub(crate) pgb: &'a mut PostgresBackend<IO>,
|
||||
pub(crate) wal_stream_builder: WalReaderStreamBuilder,
|
||||
pub(crate) end_watch_view: EndWatchView,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) pg_version: u32,
|
||||
pub(crate) appname: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Hash, Eq, PartialEq)]
|
||||
struct ShardSenderId {
|
||||
shard: ShardIdentity,
|
||||
sender_id: SenderId,
|
||||
}
|
||||
|
||||
impl Display for ShardSenderId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug())
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardSenderId {
|
||||
fn new(shard: ShardIdentity, sender_id: SenderId) -> Self {
|
||||
ShardSenderId { shard, sender_id }
|
||||
}
|
||||
|
||||
fn shard(&self) -> ShardIdentity {
|
||||
self.shard
|
||||
}
|
||||
}
|
||||
|
||||
/// Shard-aware fan-out interpreted record reader.
|
||||
/// Reads WAL from disk, decodes it, intepretets it, and sends
|
||||
/// it to any [`InterpretedWalSender`] connected to it.
|
||||
/// Each [`InterpretedWalSender`] corresponds to one shard
|
||||
/// and gets interpreted records concerning that shard only.
|
||||
pub(crate) struct InterpretedWalReader {
|
||||
wal_stream: StreamingWalReader,
|
||||
shard_senders: HashMap<ShardIdentity, smallvec::SmallVec<[ShardSenderState; 1]>>,
|
||||
shard_notification_rx: Option<tokio::sync::mpsc::UnboundedReceiver<AttachShardNotification>>,
|
||||
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
/// A handle for [`InterpretedWalReader`] which allows for interacting with it
|
||||
/// when it runs as a separate tokio task.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct InterpretedWalReaderHandle {
|
||||
join_handle: JoinHandle<Result<(), InterpretedWalReaderError>>,
|
||||
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
|
||||
shard_notification_tx: tokio::sync::mpsc::UnboundedSender<AttachShardNotification>,
|
||||
}
|
||||
|
||||
struct ShardSenderState {
|
||||
sender_id: SenderId,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
next_record_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// State of [`InterpretedWalReader`] visible outside of the task running it.
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum InterpretedWalReaderState {
|
||||
Running { current_position: Lsn },
|
||||
Done,
|
||||
}
|
||||
|
||||
pub(crate) struct Batch {
|
||||
struct Batch {
|
||||
wal_end_lsn: Lsn,
|
||||
available_wal_end_lsn: Lsn,
|
||||
records: InterpretedWalRecords,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum InterpretedWalReaderError {
|
||||
/// Handler initiates the end of streaming.
|
||||
#[error("decode error: {0}")]
|
||||
Decode(#[from] WalDecodeError),
|
||||
#[error("read or interpret error: {0}")]
|
||||
ReadOrInterpret(#[from] anyhow::Error),
|
||||
#[error("wal stream closed")]
|
||||
WalStreamClosed,
|
||||
}
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
/// Send interpreted WAL to a receiver.
|
||||
/// Stops when an error occurs or the receiver is caught up and there's no active compute.
|
||||
///
|
||||
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
|
||||
/// convenience.
|
||||
pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut wal_position = self.wal_stream_builder.start_pos();
|
||||
let mut wal_decoder =
|
||||
WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
|
||||
|
||||
impl InterpretedWalReaderState {
|
||||
fn current_position(&self) -> Option<Lsn> {
|
||||
match self {
|
||||
InterpretedWalReaderState::Running {
|
||||
current_position, ..
|
||||
} => Some(*current_position),
|
||||
InterpretedWalReaderState::Done => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?;
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
pub(crate) struct AttachShardNotification {
|
||||
shard_id: ShardIdentity,
|
||||
sender: tokio::sync::mpsc::Sender<Batch>,
|
||||
start_pos: Lsn,
|
||||
}
|
||||
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
|
||||
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
keepalive_ticker.reset();
|
||||
|
||||
impl InterpretedWalReader {
|
||||
/// Spawn the reader in a separate tokio task and return a handle
|
||||
pub(crate) fn spawn(
|
||||
wal_stream: StreamingWalReader,
|
||||
start_pos: Lsn,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
shard: ShardIdentity,
|
||||
pg_version: u32,
|
||||
appname: &Option<String>,
|
||||
) -> InterpretedWalReaderHandle {
|
||||
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
|
||||
current_position: start_pos,
|
||||
}));
|
||||
|
||||
let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let reader = InterpretedWalReader {
|
||||
wal_stream,
|
||||
shard_senders: HashMap::from([(
|
||||
shard,
|
||||
smallvec::smallvec![ShardSenderState {
|
||||
sender_id: SenderId::first(),
|
||||
tx,
|
||||
next_record_lsn: start_pos,
|
||||
}],
|
||||
)]),
|
||||
shard_notification_rx: Some(shard_notification_rx),
|
||||
state: state.clone(),
|
||||
pg_version,
|
||||
};
|
||||
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")])
|
||||
.unwrap();
|
||||
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = reader.run_impl(start_pos).await;
|
||||
if let Err(ref err) = res {
|
||||
tracing::error!("Task finished with error: {err}");
|
||||
}
|
||||
res
|
||||
}
|
||||
.instrument(info_span!("interpreted wal reader")),
|
||||
);
|
||||
|
||||
InterpretedWalReaderHandle {
|
||||
join_handle,
|
||||
state,
|
||||
shard_notification_tx,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct the reader without spawning anything
|
||||
/// Callers should drive the future returned by [`Self::run`].
|
||||
pub(crate) fn new(
|
||||
wal_stream: StreamingWalReader,
|
||||
start_pos: Lsn,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
shard: ShardIdentity,
|
||||
pg_version: u32,
|
||||
) -> InterpretedWalReader {
|
||||
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
|
||||
current_position: start_pos,
|
||||
}));
|
||||
|
||||
InterpretedWalReader {
|
||||
wal_stream,
|
||||
shard_senders: HashMap::from([(
|
||||
shard,
|
||||
smallvec::smallvec![ShardSenderState {
|
||||
sender_id: SenderId::first(),
|
||||
tx,
|
||||
next_record_lsn: start_pos,
|
||||
}],
|
||||
)]),
|
||||
shard_notification_rx: None,
|
||||
state: state.clone(),
|
||||
pg_version,
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry point for future (polling) based wal reader.
|
||||
pub(crate) async fn run(
|
||||
self,
|
||||
start_pos: Lsn,
|
||||
appname: &Option<String>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")])
|
||||
.unwrap();
|
||||
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = self.run_impl(start_pos).await;
|
||||
if let Err(err) = res {
|
||||
tracing::error!("Interpreted wal reader encountered error: {err}");
|
||||
} else {
|
||||
tracing::info!("Interpreted wal reader exiting");
|
||||
}
|
||||
|
||||
Err(CopyStreamHandlerEnd::Other(anyhow!(
|
||||
"interpreted wal reader finished"
|
||||
)))
|
||||
}
|
||||
|
||||
/// Send interpreted WAL to one or more [`InterpretedWalSender`]s
|
||||
/// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`]
|
||||
/// goes out of scope.
|
||||
async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> {
|
||||
let defer_state = self.state.clone();
|
||||
scopeguard::defer! {
|
||||
*defer_state.write().unwrap() = InterpretedWalReaderState::Done;
|
||||
}
|
||||
|
||||
let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Main branch for reading WAL and forwarding it
|
||||
wal_or_reset = self.wal_stream.next() => {
|
||||
let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below"));
|
||||
let WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: _,
|
||||
wal_end_lsn,
|
||||
available_wal_end_lsn,
|
||||
} = match wal {
|
||||
Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?,
|
||||
None => {
|
||||
// [`StreamingWalReader::next`] is an endless stream of WAL.
|
||||
// It shouldn't ever finish unless it panicked or became internally
|
||||
// inconsistent.
|
||||
return Result::Err(InterpretedWalReaderError::WalStreamClosed);
|
||||
}
|
||||
// Get some WAL from the stream and then: decode, interpret and push it down the
|
||||
// pipeline.
|
||||
wal = stream.next(), if tx.capacity() > 0 => {
|
||||
let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal {
|
||||
Some(some) => some?,
|
||||
None => { break; }
|
||||
};
|
||||
|
||||
wal_position = wal_end_lsn;
|
||||
wal_decoder.feed_bytes(&wal);
|
||||
|
||||
// Deserialize and interpret WAL records from this batch of WAL.
|
||||
// Interpreted records for each shard are collected separately.
|
||||
let shard_ids = self.shard_senders.keys().copied().collect::<Vec<_>>();
|
||||
let mut records_by_sender: HashMap<ShardSenderId, Vec<InterpretedWalRecord>> = HashMap::new();
|
||||
let mut records = Vec::new();
|
||||
let mut max_next_record_lsn = None;
|
||||
while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()?
|
||||
while let Some((next_record_lsn, recdata)) = wal_decoder
|
||||
.poll_decode()
|
||||
.with_context(|| "Failed to decode WAL")?
|
||||
{
|
||||
assert!(next_record_lsn.is_aligned());
|
||||
max_next_record_lsn = Some(next_record_lsn);
|
||||
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&shard_ids,
|
||||
&self.shard,
|
||||
next_record_lsn,
|
||||
self.pg_version,
|
||||
)
|
||||
.with_context(|| "Failed to interpret WAL")?;
|
||||
|
||||
for (shard, record) in interpreted {
|
||||
if record.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut states_iter = self.shard_senders
|
||||
.get(&shard)
|
||||
.expect("keys collected above")
|
||||
.iter()
|
||||
.filter(|state| record.next_record_lsn > state.next_record_lsn)
|
||||
.peekable();
|
||||
while let Some(state) = states_iter.next() {
|
||||
let shard_sender_id = ShardSenderId::new(shard, state.sender_id);
|
||||
|
||||
// The most commont case is one sender per shard. Peek and break to avoid the
|
||||
// clone in that situation.
|
||||
if states_iter.peek().is_none() {
|
||||
records_by_sender.entry(shard_sender_id).or_default().push(record);
|
||||
break;
|
||||
} else {
|
||||
records_by_sender.entry(shard_sender_id).or_default().push(record.clone());
|
||||
}
|
||||
}
|
||||
if !interpreted.is_empty() {
|
||||
records.push(interpreted);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -333,169 +99,19 @@ impl InterpretedWalReader {
|
||||
None => { continue; }
|
||||
};
|
||||
|
||||
// Update the current position such that new receivers can decide
|
||||
// whether to attach to us or spawn a new WAL reader.
|
||||
match &mut *self.state.write().unwrap() {
|
||||
InterpretedWalReaderState::Running { current_position, .. } => {
|
||||
*current_position = max_next_record_lsn;
|
||||
},
|
||||
InterpretedWalReaderState::Done => {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
// Send interpreted records downstream. Anything that has already been seen
|
||||
// by a shard is filtered out.
|
||||
let mut shard_senders_to_remove = Vec::new();
|
||||
for (shard, states) in &mut self.shard_senders {
|
||||
for state in states {
|
||||
if max_next_record_lsn <= state.next_record_lsn {
|
||||
continue;
|
||||
}
|
||||
|
||||
let shard_sender_id = ShardSenderId::new(*shard, state.sender_id);
|
||||
let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default();
|
||||
|
||||
let batch = InterpretedWalRecords {
|
||||
records,
|
||||
next_record_lsn: Some(max_next_record_lsn),
|
||||
};
|
||||
|
||||
let res = state.tx.send(Batch {
|
||||
wal_end_lsn,
|
||||
available_wal_end_lsn,
|
||||
records: batch,
|
||||
}).await;
|
||||
|
||||
if res.is_err() {
|
||||
shard_senders_to_remove.push(shard_sender_id);
|
||||
} else {
|
||||
state.next_record_lsn = max_next_record_lsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up any shard senders that have dropped out.
|
||||
// This is inefficient, but such events are rare (connection to PS termination)
|
||||
// and the number of subscriptions on the same shards very small (only one
|
||||
// for the steady state).
|
||||
for to_remove in shard_senders_to_remove {
|
||||
let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above");
|
||||
if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) {
|
||||
shard_senders.remove(idx);
|
||||
tracing::info!("Removed shard sender {}", to_remove);
|
||||
}
|
||||
|
||||
if shard_senders.is_empty() {
|
||||
self.shard_senders.remove(&to_remove.shard());
|
||||
}
|
||||
}
|
||||
},
|
||||
// Listen for new shards that want to attach to this reader.
|
||||
// If the reader is not running as a task, then this is not supported
|
||||
// (see the pending branch below).
|
||||
notification = match self.shard_notification_rx.as_mut() {
|
||||
Some(rx) => Either::Left(rx.recv()),
|
||||
None => Either::Right(std::future::pending())
|
||||
} => {
|
||||
if let Some(n) = notification {
|
||||
let AttachShardNotification { shard_id, sender, start_pos } = n;
|
||||
|
||||
// Update internal and external state, then reset the WAL stream
|
||||
// if required.
|
||||
let senders = self.shard_senders.entry(shard_id).or_default();
|
||||
let new_sender_id = match senders.last() {
|
||||
Some(sender) => sender.sender_id.next(),
|
||||
None => SenderId::first()
|
||||
};
|
||||
|
||||
senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
|
||||
let current_pos = self.state.read().unwrap().current_position().unwrap();
|
||||
if start_pos < current_pos {
|
||||
self.wal_stream.reset(start_pos).await;
|
||||
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Added shard sender {} with start_pos={} current_pos={}",
|
||||
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl InterpretedWalReaderHandle {
|
||||
/// Fan-out the reader by attaching a new shard to it
|
||||
pub(crate) fn fanout(
|
||||
&self,
|
||||
shard_id: ShardIdentity,
|
||||
sender: tokio::sync::mpsc::Sender<Batch>,
|
||||
start_pos: Lsn,
|
||||
) -> Result<(), SendError<AttachShardNotification>> {
|
||||
self.shard_notification_tx.send(AttachShardNotification {
|
||||
shard_id,
|
||||
sender,
|
||||
start_pos,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the current WAL position of the reader
|
||||
pub(crate) fn current_position(&self) -> Option<Lsn> {
|
||||
self.state.read().unwrap().current_position()
|
||||
}
|
||||
|
||||
pub(crate) fn abort(&self) {
|
||||
self.join_handle.abort()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for InterpretedWalReaderHandle {
|
||||
fn drop(&mut self) {
|
||||
tracing::info!("Aborting interpreted wal reader");
|
||||
self.abort()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct InterpretedWalSender<'a, IO> {
|
||||
pub(crate) format: InterpretedFormat,
|
||||
pub(crate) compression: Option<Compression>,
|
||||
pub(crate) appname: Option<String>,
|
||||
|
||||
pub(crate) tli: WalResidentTimeline,
|
||||
pub(crate) start_lsn: Lsn,
|
||||
|
||||
pub(crate) pgb: &'a mut PostgresBackend<IO>,
|
||||
pub(crate) end_watch_view: EndWatchView,
|
||||
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
|
||||
pub(crate) rx: tokio::sync::mpsc::Receiver<Batch>,
|
||||
}
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
/// Send interpreted WAL records over the network.
|
||||
/// Also manages keep-alives if nothing was sent for a while.
|
||||
pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
|
||||
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
keepalive_ticker.reset();
|
||||
|
||||
let mut wal_position = self.start_lsn;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
batch = self.rx.recv() => {
|
||||
let batch = match batch {
|
||||
Some(b) => b,
|
||||
None => {
|
||||
return Result::Err(
|
||||
CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early"))
|
||||
);
|
||||
}
|
||||
let batch = InterpretedWalRecords {
|
||||
records,
|
||||
next_record_lsn: Some(max_next_record_lsn),
|
||||
};
|
||||
|
||||
wal_position = batch.wal_end_lsn;
|
||||
tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap();
|
||||
},
|
||||
// For a previously interpreted batch, serialize it and push it down the wire.
|
||||
batch = rx.recv() => {
|
||||
let batch = match batch {
|
||||
Some(b) => b,
|
||||
None => { break; }
|
||||
};
|
||||
|
||||
let buf = batch
|
||||
.records
|
||||
@@ -516,21 +132,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
})).await?;
|
||||
}
|
||||
// Send a periodic keep alive when the connection has been idle for a while.
|
||||
// Since we've been idle, also check if we can stop streaming.
|
||||
_ = keepalive_ticker.tick() => {
|
||||
if let Some(remote_consistent_lsn) = self.wal_sender_guard
|
||||
.walsenders()
|
||||
.get_ws_remote_consistent_lsn(self.wal_sender_guard.id())
|
||||
{
|
||||
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
|
||||
// Stop streaming if the receivers are caught up and
|
||||
// there's no active compute. This causes the loop in
|
||||
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
|
||||
// to exit and terminate the WAL stream.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
self.pgb
|
||||
.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
|
||||
wal_end: self.end_watch_view.get().0,
|
||||
@@ -538,259 +140,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
request_reply: true,
|
||||
}))
|
||||
.await?;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The loop above ends when the receiver is caught up and there's no more WAL to send.
|
||||
Err(CopyStreamHandlerEnd::ServerInitiated(format!(
|
||||
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
|
||||
self.appname, wal_position,
|
||||
)))
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
send_interpreted_wal::{Batch, InterpretedWalReader},
|
||||
test_utils::Env,
|
||||
wal_reader_stream::StreamingWalReader,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_interpreted_wal_reader_fanout() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
tracing::info!("Doing first round of reads ...");
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard_0 = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let shard_1 = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut shards = HashMap::new();
|
||||
|
||||
for shard_number in 0..SHARD_COUNT {
|
||||
let shard_id = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
shards.insert(shard_id, (Some(tx), Some(rx)));
|
||||
}
|
||||
|
||||
let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap();
|
||||
let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap();
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
shard_0_tx,
|
||||
shard_0,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
tracing::info!("Reading all WAL with only shard 0 attached ...");
|
||||
|
||||
let mut shard_0_interpreted_records = Vec::new();
|
||||
while let Some(batch) = shard_0_rx.recv().await {
|
||||
shard_0_interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap();
|
||||
let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap();
|
||||
|
||||
tracing::info!("Attaching shard 1 to the reader at start of WAL");
|
||||
handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap();
|
||||
|
||||
tracing::info!("Reading all WAL with shard 0 and shard 1 attached ...");
|
||||
|
||||
let mut shard_1_interpreted_records = Vec::new();
|
||||
while let Some(batch) = shard_1_rx.recv().await {
|
||||
shard_1_interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// This test uses logical messages. Those only go to shard 0. Check that the
|
||||
// filtering worked and shard 1 did not get any.
|
||||
assert!(shard_1_interpreted_records
|
||||
.iter()
|
||||
.all(|recs| recs.records.is_empty()));
|
||||
|
||||
// Shard 0 should not receive anything more since the reader is
|
||||
// going through wal that it has already processed.
|
||||
let res = shard_0_rx.try_recv();
|
||||
if let Ok(ref ok) = res {
|
||||
tracing::error!(
|
||||
"Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}",
|
||||
ok.wal_end_lsn,
|
||||
ok.available_wal_end_lsn
|
||||
);
|
||||
}
|
||||
assert!(matches!(res, Err(TryRecvError::Empty)));
|
||||
|
||||
// Check that the next records lsns received by the two shards match up.
|
||||
let shard_0_next_lsns = shard_0_interpreted_records
|
||||
.iter()
|
||||
.map(|recs| recs.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
let shard_1_next_lsns = shard_1_interpreted_records
|
||||
.iter()
|
||||
.map(|recs| recs.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(shard_0_next_lsns, shard_1_next_lsns);
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_interpreted_wal_reader_same_shard_fanout() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
const ATTACHED_SHARDS: u8 = 4;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard_0 = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
let mut batch_receivers = vec![rx];
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
tx,
|
||||
shard_0,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
for _ in 0..(ATTACHED_SHARDS - 1) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
handle.fanout(shard_0, tx, start_lsn).unwrap();
|
||||
batch_receivers.push(rx);
|
||||
}
|
||||
|
||||
loop {
|
||||
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
|
||||
for rx in batch_receivers.iter_mut().skip(1) {
|
||||
let other_batch = rx.recv().await.unwrap();
|
||||
|
||||
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
|
||||
assert_eq!(
|
||||
batch.available_wal_end_lsn,
|
||||
other_batch.available_wal_end_lsn
|
||||
);
|
||||
}
|
||||
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,18 +2,16 @@
|
||||
//! with the "START_REPLICATION" message, and registry of walsenders.
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS};
|
||||
use crate::metrics::RECEIVED_PS_FEEDBACKS;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::TermLsn;
|
||||
use crate::send_interpreted_wal::{
|
||||
Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender,
|
||||
};
|
||||
use crate::send_interpreted_wal::InterpretedWalSender;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::StreamingWalReader;
|
||||
use crate::wal_reader_stream::WalReaderStreamBuilder;
|
||||
use crate::wal_storage::WalReader;
|
||||
use anyhow::{bail, Context as AnyhowContext};
|
||||
use bytes::Bytes;
|
||||
use futures::FutureExt;
|
||||
use futures::future::Either;
|
||||
use parking_lot::Mutex;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
|
||||
@@ -21,16 +19,16 @@ use postgres_ffi::get_current_timestamp;
|
||||
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
|
||||
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
|
||||
use safekeeper_api::models::{
|
||||
HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
|
||||
INVALID_FULL_TRANSACTION_ID,
|
||||
ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
|
||||
WalSenderState, INVALID_FULL_TRANSACTION_ID,
|
||||
};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use itertools::Itertools;
|
||||
use std::cmp::{max, min};
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
@@ -52,12 +50,6 @@ pub struct WalSenders {
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
}
|
||||
|
||||
pub struct WalSendersTimelineMetricValues {
|
||||
pub ps_feedback_counter: u64,
|
||||
pub last_ps_feedback: PageserverFeedback,
|
||||
pub interpreted_wal_reader_tasks: usize,
|
||||
}
|
||||
|
||||
impl WalSenders {
|
||||
pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
|
||||
Arc::new(WalSenders {
|
||||
@@ -68,8 +60,21 @@ impl WalSenders {
|
||||
|
||||
/// Register new walsender. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
fn register(self: &Arc<WalSenders>, walsender_state: WalSenderState) -> WalSenderGuard {
|
||||
fn register(
|
||||
self: &Arc<WalSenders>,
|
||||
ttid: TenantTimelineId,
|
||||
addr: SocketAddr,
|
||||
conn_id: ConnectionId,
|
||||
appname: Option<String>,
|
||||
) -> WalSenderGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walsender_state = WalSenderState {
|
||||
ttid,
|
||||
addr,
|
||||
conn_id,
|
||||
appname,
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
};
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walsender_state);
|
||||
@@ -85,79 +90,9 @@ impl WalSenders {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_or_update_interpreted_reader<
|
||||
FUp: FnOnce(&Arc<InterpretedWalReaderHandle>) -> anyhow::Result<()>,
|
||||
FNew: FnOnce() -> InterpretedWalReaderHandle,
|
||||
>(
|
||||
self: &Arc<WalSenders>,
|
||||
id: WalSenderId,
|
||||
start_pos: Lsn,
|
||||
max_delta_for_fanout: Option<u64>,
|
||||
update: FUp,
|
||||
create: FNew,
|
||||
) -> anyhow::Result<()> {
|
||||
let state = &mut self.mutex.lock();
|
||||
|
||||
let mut selected_interpreted_reader = None;
|
||||
for slot in state.slots.iter().flatten() {
|
||||
if let WalSenderState::Interpreted(slot_state) = slot {
|
||||
if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader {
|
||||
let select = match (interpreted_reader.current_position(), max_delta_for_fanout)
|
||||
{
|
||||
(Some(pos), Some(max_delta)) => {
|
||||
let delta = pos.0.abs_diff(start_pos.0);
|
||||
delta <= max_delta
|
||||
}
|
||||
// Reader is not active
|
||||
(None, _) => false,
|
||||
// Gating fanout by max delta is disabled.
|
||||
// Attach to any active reader.
|
||||
(_, None) => true,
|
||||
};
|
||||
|
||||
if select {
|
||||
selected_interpreted_reader = Some(interpreted_reader.clone());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let slot = state.get_slot_mut(id);
|
||||
let slot_state = match slot {
|
||||
WalSenderState::Interpreted(s) => s,
|
||||
WalSenderState::Vanilla(_) => unreachable!(),
|
||||
};
|
||||
|
||||
let selected_or_new = match selected_interpreted_reader {
|
||||
Some(selected) => {
|
||||
update(&selected)?;
|
||||
selected
|
||||
}
|
||||
None => Arc::new(create()),
|
||||
};
|
||||
|
||||
slot_state.interpreted_wal_reader = Some(selected_or_new);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get state of all walsenders.
|
||||
pub fn get_all_public(self: &Arc<WalSenders>) -> Vec<safekeeper_api::models::WalSenderState> {
|
||||
self.mutex
|
||||
.lock()
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|state| match state {
|
||||
WalSenderState::Vanilla(s) => {
|
||||
safekeeper_api::models::WalSenderState::Vanilla(s.clone())
|
||||
}
|
||||
WalSenderState::Interpreted(s) => {
|
||||
safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone())
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
}
|
||||
|
||||
/// Get LSN of the most lagging pageserver receiver. Return None if there are no
|
||||
@@ -168,7 +103,7 @@ impl WalSenders {
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.filter_map(|s| match s.get_feedback() {
|
||||
.filter_map(|s| match s.feedback {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
|
||||
ReplicationFeedback::Standby(_) => None,
|
||||
})
|
||||
@@ -176,25 +111,9 @@ impl WalSenders {
|
||||
}
|
||||
|
||||
/// Returns total counter of pageserver feedbacks received and last feedback.
|
||||
pub fn info_for_metrics(self: &Arc<WalSenders>) -> WalSendersTimelineMetricValues {
|
||||
pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
|
||||
let shared = self.mutex.lock();
|
||||
|
||||
let interpreted_wal_reader_tasks = shared
|
||||
.slots
|
||||
.iter()
|
||||
.filter_map(|ss| match ss {
|
||||
Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(),
|
||||
Some(WalSenderState::Vanilla(_)) => None,
|
||||
None => None,
|
||||
})
|
||||
.unique_by(|reader| Arc::as_ptr(reader))
|
||||
.count();
|
||||
|
||||
WalSendersTimelineMetricValues {
|
||||
ps_feedback_counter: shared.ps_feedback_counter,
|
||||
last_ps_feedback: shared.last_ps_feedback,
|
||||
interpreted_wal_reader_tasks,
|
||||
}
|
||||
(shared.ps_feedback_counter, shared.last_ps_feedback)
|
||||
}
|
||||
|
||||
/// Get aggregated hot standby feedback (we send it to compute).
|
||||
@@ -205,7 +124,7 @@ impl WalSenders {
|
||||
/// Record new pageserver feedback, update aggregated values.
|
||||
fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
*shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
|
||||
shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
|
||||
shared.last_ps_feedback = *feedback;
|
||||
shared.ps_feedback_counter += 1;
|
||||
drop(shared);
|
||||
@@ -224,10 +143,10 @@ impl WalSenders {
|
||||
"Record standby reply: ts={} apply_lsn={}",
|
||||
reply.reply_ts, reply.apply_lsn
|
||||
);
|
||||
match &mut slot.get_mut_feedback() {
|
||||
match &mut slot.feedback {
|
||||
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: *reply,
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
})
|
||||
@@ -239,10 +158,10 @@ impl WalSenders {
|
||||
fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slot = shared.get_slot_mut(id);
|
||||
match &mut slot.get_mut_feedback() {
|
||||
match &mut slot.feedback {
|
||||
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: StandbyReply::empty(),
|
||||
hs_feedback: *feedback,
|
||||
})
|
||||
@@ -256,7 +175,7 @@ impl WalSenders {
|
||||
pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
|
||||
let shared = self.mutex.lock();
|
||||
let slot = shared.get_slot(id);
|
||||
match slot.get_feedback() {
|
||||
match slot.feedback {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
|
||||
_ => None,
|
||||
}
|
||||
@@ -280,47 +199,6 @@ struct WalSendersShared {
|
||||
slots: Vec<Option<WalSenderState>>,
|
||||
}
|
||||
|
||||
/// Safekeeper internal definitions of wal sender state
|
||||
///
|
||||
/// As opposed to [`safekeeper_api::models::WalSenderState`] these struct may
|
||||
/// include state that we don not wish to expose to the public api.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum WalSenderState {
|
||||
Vanilla(VanillaWalSenderInternalState),
|
||||
Interpreted(InterpretedWalSenderInternalState),
|
||||
}
|
||||
|
||||
type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct InterpretedWalSenderInternalState {
|
||||
public_state: safekeeper_api::models::InterpretedWalSenderState,
|
||||
interpreted_wal_reader: Option<Arc<InterpretedWalReaderHandle>>,
|
||||
}
|
||||
|
||||
impl WalSenderState {
|
||||
fn get_addr(&self) -> &SocketAddr {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &state.addr,
|
||||
WalSenderState::Interpreted(state) => &state.public_state.addr,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_feedback(&self) -> &ReplicationFeedback {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &state.feedback,
|
||||
WalSenderState::Interpreted(state) => &state.public_state.feedback,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &mut state.feedback,
|
||||
WalSenderState::Interpreted(state) => &mut state.public_state.feedback,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl WalSendersShared {
|
||||
fn new() -> Self {
|
||||
WalSendersShared {
|
||||
@@ -347,7 +225,7 @@ impl WalSendersShared {
|
||||
let mut agg = HotStandbyFeedback::empty();
|
||||
let mut reply_agg = StandbyReply::empty();
|
||||
for ws_state in self.slots.iter().flatten() {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
|
||||
let hs_feedback = standby_feedback.hs_feedback;
|
||||
// doing Option math like op1.iter().chain(op2.iter()).min()
|
||||
// would be nicer, but we serialize/deserialize this struct
|
||||
@@ -439,7 +317,7 @@ impl SafekeeperPostgresHandler {
|
||||
/// Wrapper around handle_start_replication_guts handling result. Error is
|
||||
/// handled here while we're still in walsender ttid span; with API
|
||||
/// extension, this can probably be moved into postgres_backend.
|
||||
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
@@ -464,7 +342,7 @@ impl SafekeeperPostgresHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
@@ -474,30 +352,12 @@ impl SafekeeperPostgresHandler {
|
||||
let appname = self.appname.clone();
|
||||
|
||||
// Use a guard object to remove our entry from the timeline when we are done.
|
||||
let ws_guard = match self.protocol() {
|
||||
PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register(
|
||||
WalSenderState::Vanilla(VanillaWalSenderInternalState {
|
||||
ttid: self.ttid,
|
||||
addr: *pgb.get_peer_addr(),
|
||||
conn_id: self.conn_id,
|
||||
appname: self.appname.clone(),
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
}),
|
||||
)),
|
||||
PostgresClientProtocol::Interpreted { .. } => Arc::new(tli.get_walsenders().register(
|
||||
WalSenderState::Interpreted(InterpretedWalSenderInternalState {
|
||||
public_state: safekeeper_api::models::InterpretedWalSenderState {
|
||||
ttid: self.ttid,
|
||||
shard: self.shard.unwrap(),
|
||||
addr: *pgb.get_peer_addr(),
|
||||
conn_id: self.conn_id,
|
||||
appname: self.appname.clone(),
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
},
|
||||
interpreted_wal_reader: None,
|
||||
}),
|
||||
)),
|
||||
};
|
||||
let ws_guard = Arc::new(tli.get_walsenders().register(
|
||||
self.ttid,
|
||||
*pgb.get_peer_addr(),
|
||||
self.conn_id,
|
||||
self.appname.clone(),
|
||||
));
|
||||
|
||||
// Walsender can operate in one of two modes which we select by
|
||||
// application_name: give only committed WAL (used by pageserver) or all
|
||||
@@ -543,7 +403,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb,
|
||||
// should succeed since we're already holding another guard
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
appname: appname.clone(),
|
||||
appname,
|
||||
start_pos,
|
||||
end_pos,
|
||||
term,
|
||||
@@ -553,7 +413,7 @@ impl SafekeeperPostgresHandler {
|
||||
send_buf: vec![0u8; MAX_SEND_SIZE],
|
||||
};
|
||||
|
||||
FutureExt::boxed(sender.run())
|
||||
Either::Left(sender.run())
|
||||
}
|
||||
PostgresClientProtocol::Interpreted {
|
||||
format,
|
||||
@@ -561,96 +421,27 @@ impl SafekeeperPostgresHandler {
|
||||
} => {
|
||||
let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
|
||||
let end_watch_view = end_watch.view();
|
||||
let wal_residence_guard = tli.wal_residence_guard().await?;
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(2);
|
||||
let shard = self.shard.unwrap();
|
||||
let wal_stream_builder = WalReaderStreamBuilder {
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_pos,
|
||||
end_pos,
|
||||
term,
|
||||
end_watch,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
};
|
||||
|
||||
if self.conf.wal_reader_fanout && !shard.is_unsharded() {
|
||||
let ws_id = ws_guard.id();
|
||||
ws_guard.walsenders().create_or_update_interpreted_reader(
|
||||
ws_id,
|
||||
start_pos,
|
||||
self.conf.max_delta_for_fanout,
|
||||
{
|
||||
let tx = tx.clone();
|
||||
|reader| {
|
||||
tracing::info!(
|
||||
"Fanning out interpreted wal reader at {}",
|
||||
start_pos
|
||||
);
|
||||
reader
|
||||
.fanout(shard, tx, start_pos)
|
||||
.with_context(|| "Failed to fan out reader")
|
||||
}
|
||||
},
|
||||
|| {
|
||||
tracing::info!("Spawning interpreted wal reader at {}", start_pos);
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
pgb,
|
||||
wal_stream_builder,
|
||||
end_watch_view,
|
||||
shard: self.shard.unwrap(),
|
||||
pg_version,
|
||||
appname,
|
||||
};
|
||||
|
||||
let wal_stream = StreamingWalReader::new(
|
||||
wal_residence_guard,
|
||||
term,
|
||||
start_pos,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
InterpretedWalReader::spawn(
|
||||
wal_stream, start_pos, tx, shard, pg_version, &appname,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
appname,
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_lsn: start_pos,
|
||||
pgb,
|
||||
end_watch_view,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
rx,
|
||||
};
|
||||
|
||||
FutureExt::boxed(sender.run())
|
||||
} else {
|
||||
let wal_reader = StreamingWalReader::new(
|
||||
wal_residence_guard,
|
||||
term,
|
||||
start_pos,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let reader =
|
||||
InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version);
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
appname: appname.clone(),
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_lsn: start_pos,
|
||||
pgb,
|
||||
end_watch_view,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
rx,
|
||||
};
|
||||
|
||||
FutureExt::boxed(async move {
|
||||
// Sender returns an Err on all code paths.
|
||||
// If the sender finishes first, we will drop the reader future.
|
||||
// If the reader finishes first, the sender will finish too since
|
||||
// the wal sender has dropped.
|
||||
let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname));
|
||||
match res.map(|_| ()) {
|
||||
Ok(_) => unreachable!("sender finishes with Err by convention"),
|
||||
err_res => err_res,
|
||||
}
|
||||
})
|
||||
}
|
||||
Either::Right(sender.run())
|
||||
}
|
||||
};
|
||||
|
||||
@@ -679,8 +470,7 @@ impl SafekeeperPostgresHandler {
|
||||
.clone();
|
||||
info!(
|
||||
"finished streaming to {}, feedback={:?}",
|
||||
ws_state.get_addr(),
|
||||
ws_state.get_feedback(),
|
||||
ws_state.addr, ws_state.feedback,
|
||||
);
|
||||
|
||||
// Join pg backend back.
|
||||
@@ -788,18 +578,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
|
||||
/// convenience.
|
||||
async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&[
|
||||
"future",
|
||||
self.appname.as_deref().unwrap_or("safekeeper"),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
// Wait for the next portion if it is not there yet, or just
|
||||
// update our end of WAL available for sending value, we
|
||||
@@ -1035,7 +813,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use safekeeper_api::models::FullTransactionId;
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1052,13 +830,13 @@ mod tests {
|
||||
|
||||
// add to wss specified feedback setting other fields to dummy values
|
||||
fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
|
||||
let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState {
|
||||
let walsender_state = WalSenderState {
|
||||
ttid: mock_ttid(),
|
||||
addr: mock_addr(),
|
||||
conn_id: 1,
|
||||
appname: None,
|
||||
feedback,
|
||||
});
|
||||
};
|
||||
wss.slots.push(Some(walsender_state))
|
||||
}
|
||||
|
||||
|
||||
@@ -1,25 +1,20 @@
|
||||
//! Defines per timeline data stored persistently (SafeKeeperPersistentState)
|
||||
//! and its wrapper with in memory layer (SafekeeperState).
|
||||
|
||||
use std::{cmp::max, ops::Deref, time::SystemTime};
|
||||
use std::{cmp::max, ops::Deref};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::{
|
||||
membership::Configuration,
|
||||
models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse},
|
||||
ServerInfo, Term, INITIAL_TERM,
|
||||
};
|
||||
use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION},
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION},
|
||||
timeline::TimelineError,
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
@@ -32,8 +27,6 @@ pub struct TimelinePersistentState {
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// Membership configuration.
|
||||
pub mconf: Configuration,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
@@ -65,15 +58,22 @@ pub struct TimelinePersistentState {
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
pub creation_ts: SystemTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
/// State of the local WAL files. Used to track current timeline state,
|
||||
/// that can be either WAL files are present on disk or last partial segment
|
||||
/// is offloaded to remote storage.
|
||||
@@ -87,14 +87,12 @@ pub enum EvictionState {
|
||||
}
|
||||
|
||||
impl TimelinePersistentState {
|
||||
/// commit_lsn is the same as start_lsn in the normal creaiton; see
|
||||
/// `TimelineCreateRequest` comments.`
|
||||
pub fn new(
|
||||
ttid: &TenantTimelineId,
|
||||
mconf: Configuration,
|
||||
server_info: ServerInfo,
|
||||
start_lsn: Lsn,
|
||||
peers: Vec<NodeId>,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> anyhow::Result<TimelinePersistentState> {
|
||||
if server_info.wal_seg_size == 0 {
|
||||
bail!(TimelineError::UninitializedWalSegSize(*ttid));
|
||||
@@ -104,59 +102,49 @@ impl TimelinePersistentState {
|
||||
bail!(TimelineError::UninitialinzedPgVersion(*ttid));
|
||||
}
|
||||
|
||||
if commit_lsn < start_lsn {
|
||||
if commit_lsn < local_start_lsn {
|
||||
bail!(
|
||||
"commit_lsn {} is smaller than start_lsn {}",
|
||||
"commit_lsn {} is smaller than local_start_lsn {}",
|
||||
commit_lsn,
|
||||
start_lsn
|
||||
local_start_lsn
|
||||
);
|
||||
}
|
||||
|
||||
// If we are given with init LSN, initialize term history with it. It
|
||||
// ensures that walproposer always must be able to find a common point
|
||||
// in histories; if it can't something is corrupted. Not having LSN here
|
||||
// is so far left for legacy case where timeline is created by compute
|
||||
// and LSN during creation is not known yet.
|
||||
let term_history = if commit_lsn != Lsn::INVALID {
|
||||
TermHistory(vec![TermLsn {
|
||||
term: INITIAL_TERM,
|
||||
lsn: start_lsn,
|
||||
}])
|
||||
} else {
|
||||
TermHistory::empty()
|
||||
};
|
||||
|
||||
Ok(TimelinePersistentState {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
mconf,
|
||||
acceptor_state: AcceptorState {
|
||||
term: INITIAL_TERM,
|
||||
term_history,
|
||||
term: 0,
|
||||
term_history: TermHistory::empty(),
|
||||
},
|
||||
server: server_info,
|
||||
proposer_uuid: [0; 16],
|
||||
timeline_start_lsn: start_lsn,
|
||||
local_start_lsn: start_lsn,
|
||||
timeline_start_lsn: Lsn(0),
|
||||
local_start_lsn,
|
||||
commit_lsn,
|
||||
backup_lsn: start_lsn,
|
||||
peer_horizon_lsn: start_lsn,
|
||||
backup_lsn: local_start_lsn,
|
||||
peer_horizon_lsn: local_start_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(
|
||||
peers
|
||||
.iter()
|
||||
.map(|p| (*p, PersistedPeerInfo::new()))
|
||||
.collect(),
|
||||
),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: SystemTime::now(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn empty() -> Self {
|
||||
TimelinePersistentState::new(
|
||||
&TenantTimelineId::empty(),
|
||||
Configuration::empty(),
|
||||
ServerInfo {
|
||||
pg_version: 170000, /* Postgres server version (major * 10000) */
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
},
|
||||
vec![],
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
@@ -261,31 +249,6 @@ where
|
||||
current_term: after,
|
||||
})
|
||||
}
|
||||
|
||||
/// Switch into membership configuration `to` if it is higher than the
|
||||
/// current one.
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let before = self.mconf.clone();
|
||||
// Is switch allowed?
|
||||
if to.generation <= self.mconf.generation {
|
||||
info!(
|
||||
"ignoring request to switch membership conf to lower {}, current conf {}",
|
||||
to, self.mconf
|
||||
);
|
||||
} else {
|
||||
let mut state = self.start_change();
|
||||
state.mconf = to.clone();
|
||||
self.finish_change(&state).await?;
|
||||
info!("switched membership conf to {} from {}", to, before);
|
||||
}
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
previous_conf: before,
|
||||
current_conf: self.mconf.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<CTRL> Deref for TimelineState<CTRL>
|
||||
|
||||
@@ -1,19 +1,13 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::receive_wal::WalAcceptor;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
|
||||
ProposerElected, SafeKeeper, TermHistory,
|
||||
};
|
||||
use crate::send_wal::EndWatch;
|
||||
use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
|
||||
use crate::state::{TimelinePersistentState, TimelineState};
|
||||
use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::remote_timeline_path;
|
||||
use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf};
|
||||
use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
|
||||
use tokio::fs::create_dir_all;
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -113,59 +107,4 @@ impl Env {
|
||||
);
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
// This will be dead code when building a non-benchmark target with the
|
||||
// benchmarking feature enabled.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn write_wal(
|
||||
tli: Arc<Timeline>,
|
||||
start_lsn: Lsn,
|
||||
msg_size: usize,
|
||||
msg_count: usize,
|
||||
) -> anyhow::Result<EndWatch> {
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
|
||||
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
|
||||
|
||||
let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx());
|
||||
|
||||
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
|
||||
|
||||
let prefix = c"p";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(msg_size >= prefixlen);
|
||||
let message = vec![0; msg_size - prefixlen];
|
||||
|
||||
let walgen =
|
||||
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
|
||||
for _ in 0..msg_count {
|
||||
let (lsn, record) = walgen.next().unwrap();
|
||||
|
||||
let req = AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: start_lsn,
|
||||
begin_lsn: lsn,
|
||||
end_lsn: lsn + record.len() as u64,
|
||||
commit_lsn: lsn,
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
},
|
||||
wal_data: record,
|
||||
};
|
||||
|
||||
let end_lsn = req.h.end_lsn;
|
||||
|
||||
let msg = ProposerAcceptorMessage::AppendRequest(req);
|
||||
msg_tx.send(msg).await?;
|
||||
while let Some(reply) = reply_rx.recv().await {
|
||||
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
|
||||
if resp.flush_lsn >= end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(end_watch)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,10 +4,7 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use remote_storage::RemotePath;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{
|
||||
PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse,
|
||||
};
|
||||
use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -35,7 +32,7 @@ use crate::control_file;
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
|
||||
use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues};
|
||||
use crate::send_wal::WalSenders;
|
||||
use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState};
|
||||
use crate::timeline_guard::ResidenceGuard;
|
||||
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
|
||||
@@ -191,13 +188,6 @@ impl StateSK {
|
||||
self.state_mut().term_bump(to).await
|
||||
}
|
||||
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
self.state_mut().membership_switch(to).await
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
fn close_wal_store(&mut self) {
|
||||
if let StateSK::Loaded(sk) = self {
|
||||
@@ -712,22 +702,16 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
let WalSendersTimelineMetricValues {
|
||||
ps_feedback_counter,
|
||||
last_ps_feedback,
|
||||
interpreted_wal_reader_tasks,
|
||||
} = self.walsenders.info_for_metrics();
|
||||
|
||||
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
|
||||
let state = self.read_shared_state().await;
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
ps_feedback_count: ps_feedback_counter,
|
||||
ps_feedback_count,
|
||||
last_ps_feedback,
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
|
||||
interpreted_wal_reader_tasks,
|
||||
epoch_start_lsn: state.sk.term_start_lsn(),
|
||||
mem_state: state.sk.state().inmem.clone(),
|
||||
persisted_state: TimelinePersistentState::clone(state.sk.state()),
|
||||
@@ -746,7 +730,7 @@ impl Timeline {
|
||||
debug_dump::Memory {
|
||||
is_cancelled: self.is_cancelled(),
|
||||
peers_info_len: state.peers_info.0.len(),
|
||||
walsenders: self.walsenders.get_all_public(),
|
||||
walsenders: self.walsenders.get_all(),
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
@@ -784,14 +768,6 @@ impl Timeline {
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
pub async fn membership_switch(
|
||||
self: &Arc<Self>,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
state.sk.membership_switch(to).await
|
||||
}
|
||||
|
||||
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
|
||||
async fn do_wal_residence_guard(
|
||||
self: &Arc<Self>,
|
||||
|
||||
@@ -12,7 +12,6 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::ServerInfo;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -215,10 +214,9 @@ impl GlobalTimelines {
|
||||
pub(crate) async fn create(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
mconf: Configuration,
|
||||
server_info: ServerInfo,
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
@@ -241,7 +239,8 @@ impl GlobalTimelines {
|
||||
|
||||
// TODO: currently we create only cfile. It would be reasonable to
|
||||
// immediately initialize first WAL segment as well.
|
||||
let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?;
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
|
||||
let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
Ok(timeline)
|
||||
|
||||
@@ -1,16 +1,34 @@
|
||||
use std::{
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_stream::try_stream;
|
||||
use bytes::Bytes;
|
||||
use futures::{stream::BoxStream, Stream, StreamExt};
|
||||
use futures::Stream;
|
||||
use postgres_backend::CopyStreamHandlerEnd;
|
||||
use safekeeper_api::Term;
|
||||
use std::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader};
|
||||
use safekeeper_api::Term;
|
||||
use crate::{
|
||||
send_wal::{EndWatch, WalSenderGuard},
|
||||
timeline::WalResidentTimeline,
|
||||
};
|
||||
|
||||
pub(crate) struct WalReaderStreamBuilder {
|
||||
pub(crate) tli: WalResidentTimeline,
|
||||
pub(crate) start_pos: Lsn,
|
||||
pub(crate) end_pos: Lsn,
|
||||
pub(crate) term: Option<Term>,
|
||||
pub(crate) end_watch: EndWatch,
|
||||
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
|
||||
}
|
||||
|
||||
impl WalReaderStreamBuilder {
|
||||
pub(crate) fn start_pos(&self) -> Lsn {
|
||||
self.start_pos
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub(crate) struct WalBytes {
|
||||
/// Raw PG WAL
|
||||
pub(crate) wal: Bytes,
|
||||
@@ -26,270 +44,106 @@ pub(crate) struct WalBytes {
|
||||
pub(crate) available_wal_end_lsn: Lsn,
|
||||
}
|
||||
|
||||
struct PositionedWalReader {
|
||||
start: Lsn,
|
||||
end: Lsn,
|
||||
reader: Option<WalReader>,
|
||||
}
|
||||
|
||||
/// A streaming WAL reader wrapper which can be reset while running
|
||||
pub(crate) struct StreamingWalReader {
|
||||
stream: BoxStream<'static, WalOrReset>,
|
||||
start_changed_tx: tokio::sync::watch::Sender<Lsn>,
|
||||
}
|
||||
|
||||
pub(crate) enum WalOrReset {
|
||||
Wal(anyhow::Result<WalBytes>),
|
||||
Reset(Lsn),
|
||||
}
|
||||
|
||||
impl WalOrReset {
|
||||
pub(crate) fn get_wal(self) -> Option<anyhow::Result<WalBytes>> {
|
||||
match self {
|
||||
WalOrReset::Wal(wal) => Some(wal),
|
||||
WalOrReset::Reset(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StreamingWalReader {
|
||||
pub(crate) fn new(
|
||||
tli: WalResidentTimeline,
|
||||
term: Option<Term>,
|
||||
start: Lsn,
|
||||
end: Lsn,
|
||||
end_watch: EndWatch,
|
||||
impl WalReaderStreamBuilder {
|
||||
/// Builds a stream of Postgres WAL starting from [`Self::start_pos`].
|
||||
/// The stream terminates when the receiver (pageserver) is fully caught up
|
||||
/// and there's no active computes.
|
||||
pub(crate) async fn build(
|
||||
self,
|
||||
buffer_size: usize,
|
||||
) -> Self {
|
||||
let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start);
|
||||
|
||||
let state = WalReaderStreamState {
|
||||
) -> anyhow::Result<impl Stream<Item = Result<WalBytes, CopyStreamHandlerEnd>>> {
|
||||
// TODO(vlad): The code below duplicates functionality from [`crate::send_wal`].
|
||||
// We can make the raw WAL sender use this stream too and remove the duplication.
|
||||
let Self {
|
||||
tli,
|
||||
wal_reader: PositionedWalReader {
|
||||
start,
|
||||
end,
|
||||
reader: None,
|
||||
},
|
||||
mut start_pos,
|
||||
mut end_pos,
|
||||
term,
|
||||
end_watch,
|
||||
buffer: vec![0; buffer_size],
|
||||
buffer_size,
|
||||
};
|
||||
mut end_watch,
|
||||
wal_sender_guard,
|
||||
} = self;
|
||||
let mut wal_reader = tli.get_walreader(start_pos).await?;
|
||||
let mut buffer = vec![0; buffer_size];
|
||||
|
||||
// When a change notification is received while polling the internal
|
||||
// reader, stop polling the read future and service the change.
|
||||
let stream = futures::stream::unfold(
|
||||
(state, start_changed_rx),
|
||||
|(mut state, mut rx)| async move {
|
||||
let wal_or_reset = tokio::select! {
|
||||
read_res = state.read() => { WalOrReset::Wal(read_res) },
|
||||
changed_res = rx.changed() => {
|
||||
if changed_res.is_err() {
|
||||
return None;
|
||||
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
|
||||
Ok(try_stream! {
|
||||
loop {
|
||||
let have_something_to_send = end_pos > start_pos;
|
||||
|
||||
if !have_something_to_send {
|
||||
// wait for lsn
|
||||
let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await;
|
||||
match res {
|
||||
Ok(ok) => {
|
||||
end_pos = ok?;
|
||||
},
|
||||
Err(_) => {
|
||||
if let EndWatch::Commit(_) = end_watch {
|
||||
if let Some(remote_consistent_lsn) = wal_sender_guard
|
||||
.walsenders()
|
||||
.get_ws_remote_consistent_lsn(wal_sender_guard.id())
|
||||
{
|
||||
if tli.should_walsender_stop(remote_consistent_lsn).await {
|
||||
// Stop streaming if the receivers are caught up and
|
||||
// there's no active compute. This causes the loop in
|
||||
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
|
||||
// to exit and terminate the WAL stream.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
let new_start_pos = rx.borrow_and_update();
|
||||
WalOrReset::Reset(*new_start_pos)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
assert!(
|
||||
end_pos > start_pos,
|
||||
"nothing to send after waiting for WAL"
|
||||
);
|
||||
|
||||
// try to send as much as available, capped by the buffer size
|
||||
let mut chunk_end_pos = start_pos + buffer_size as u64;
|
||||
// if we went behind available WAL, back off
|
||||
if chunk_end_pos >= end_pos {
|
||||
chunk_end_pos = end_pos;
|
||||
} else {
|
||||
// If sending not up to end pos, round down to page boundary to
|
||||
// avoid breaking WAL record not at page boundary, as protocol
|
||||
// demands. See walsender.c (XLogSendPhysical).
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
let send_size = (chunk_end_pos.0 - start_pos.0) as usize;
|
||||
let buffer = &mut buffer[..send_size];
|
||||
let send_size: usize;
|
||||
{
|
||||
// If uncommitted part is being pulled, check that the term is
|
||||
// still the expected one.
|
||||
let _term_guard = if let Some(t) = term {
|
||||
Some(tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// Read WAL into buffer. send_size can be additionally capped to
|
||||
// segment boundary here.
|
||||
send_size = wal_reader.read(buffer).await?
|
||||
};
|
||||
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
|
||||
|
||||
yield WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: start_pos,
|
||||
wal_end_lsn: start_pos + send_size as u64,
|
||||
available_wal_end_lsn: end_pos
|
||||
};
|
||||
|
||||
if let WalOrReset::Reset(lsn) = wal_or_reset {
|
||||
state.wal_reader.start = lsn;
|
||||
state.wal_reader.reader = None;
|
||||
}
|
||||
|
||||
Some((wal_or_reset, (state, rx)))
|
||||
},
|
||||
)
|
||||
.boxed();
|
||||
|
||||
Self {
|
||||
stream,
|
||||
start_changed_tx,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the stream to a given position.
|
||||
pub(crate) async fn reset(&mut self, start: Lsn) {
|
||||
self.start_changed_tx.send(start).unwrap();
|
||||
while let Some(wal_or_reset) = self.stream.next().await {
|
||||
match wal_or_reset {
|
||||
WalOrReset::Reset(at) => {
|
||||
// Stream confirmed the reset.
|
||||
// There may only be one ongoing reset at any given time,
|
||||
// hence the assertion.
|
||||
assert_eq!(at, start);
|
||||
break;
|
||||
}
|
||||
WalOrReset::Wal(_) => {
|
||||
// Ignore wal generated before reset was handled
|
||||
}
|
||||
start_pos += send_size as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for StreamingWalReader {
|
||||
type Item = WalOrReset;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
Pin::new(&mut self.stream).poll_next(cx)
|
||||
}
|
||||
}
|
||||
|
||||
/// State carried between successive chunk reads of the WAL stream.
/// `WalReaderStreamState::read` uses every field below to produce one
/// `WalBytes` chunk per call.
struct WalReaderStreamState {
|
||||
/// Timeline handle; `read` uses it to create the WAL reader
/// (`get_walreader`) and to acquire a term guard (`acquire_term`).
tli: WalResidentTimeline,
|
||||
/// Reader together with its `start`/`end` positions. The inner `reader`
/// is created on demand when it is `None`; `start` is advanced by the
/// number of bytes returned from each read.
wal_reader: PositionedWalReader,
|
||||
/// When `Some`, a term guard for this term is held while reading WAL.
term: Option<Term>,
|
||||
/// Used to wait (`wait_for_lsn`) until WAL past `wal_reader.start` exists.
end_watch: EndWatch,
|
||||
/// Scratch buffer the WAL is read into before being copied out.
buffer: Vec<u8>,
|
||||
/// Upper bound on the size of a single chunk, in bytes.
buffer_size: usize,
|
||||
}
|
||||
|
||||
impl WalReaderStreamState {
|
||||
async fn read(&mut self) -> anyhow::Result<WalBytes> {
|
||||
// Create reader if needed
|
||||
if self.wal_reader.reader.is_none() {
|
||||
self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?);
|
||||
}
|
||||
|
||||
let have_something_to_send = self.wal_reader.end > self.wal_reader.start;
|
||||
if !have_something_to_send {
|
||||
tracing::debug!(
|
||||
"Waiting for wal: start={}, end={}",
|
||||
self.wal_reader.end,
|
||||
self.wal_reader.start
|
||||
);
|
||||
self.wal_reader.end = self
|
||||
.end_watch
|
||||
.wait_for_lsn(self.wal_reader.start, self.term)
|
||||
.await?;
|
||||
tracing::debug!(
|
||||
"Done waiting for wal: start={}, end={}",
|
||||
self.wal_reader.end,
|
||||
self.wal_reader.start
|
||||
);
|
||||
}
|
||||
|
||||
assert!(
|
||||
self.wal_reader.end > self.wal_reader.start,
|
||||
"nothing to send after waiting for WAL"
|
||||
);
|
||||
|
||||
// Calculate chunk size
|
||||
let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64;
|
||||
if chunk_end_pos >= self.wal_reader.end {
|
||||
chunk_end_pos = self.wal_reader.end;
|
||||
} else {
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize;
|
||||
let buffer = &mut self.buffer[..send_size];
|
||||
|
||||
// Read WAL
|
||||
let send_size = {
|
||||
let _term_guard = if let Some(t) = self.term {
|
||||
Some(self.tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
self.wal_reader
|
||||
.reader
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.read(buffer)
|
||||
.await?
|
||||
};
|
||||
|
||||
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
|
||||
let result = WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: self.wal_reader.start,
|
||||
wal_end_lsn: self.wal_reader.start + send_size as u64,
|
||||
available_wal_end_lsn: self.wal_reader.end,
|
||||
};
|
||||
|
||||
self.wal_reader.start += send_size as u64;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use futures::StreamExt;
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_streaming_wal_reader_reset() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
tracing::info!("Doing first round of reads ...");
|
||||
|
||||
let mut streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let mut before_reset = Vec::new();
|
||||
while let Some(wor) = streaming_wal_reader.next().await {
|
||||
let wal = wor.get_wal().unwrap().unwrap();
|
||||
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
|
||||
before_reset.push(wal);
|
||||
|
||||
if stop {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Resetting the WAL stream ...");
|
||||
|
||||
streaming_wal_reader.reset(start_lsn).await;
|
||||
|
||||
tracing::info!("Doing second round of reads ...");
|
||||
|
||||
let mut after_reset = Vec::new();
|
||||
while let Some(wor) = streaming_wal_reader.next().await {
|
||||
let wal = wor.get_wal().unwrap().unwrap();
|
||||
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
|
||||
after_reset.push(wal);
|
||||
|
||||
if stop {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(before_reset, after_reset);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,15 +15,13 @@ use desim::{
|
||||
};
|
||||
use http::Uri;
|
||||
use safekeeper::{
|
||||
safekeeper::{
|
||||
ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION,
|
||||
},
|
||||
safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION},
|
||||
state::{TimelinePersistentState, TimelineState},
|
||||
timeline::TimelineError,
|
||||
wal_storage::Storage,
|
||||
SafeKeeperConf,
|
||||
};
|
||||
use safekeeper_api::{membership::Configuration, ServerInfo};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use tracing::{debug, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -98,13 +96,8 @@ impl GlobalMap {
|
||||
let commit_lsn = Lsn::INVALID;
|
||||
let local_start_lsn = Lsn::INVALID;
|
||||
|
||||
let state = TimelinePersistentState::new(
|
||||
&ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
commit_lsn,
|
||||
local_start_lsn,
|
||||
)?;
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
|
||||
let disk_timeline = self.disk.put_state(&ttid, state);
|
||||
let control_store = DiskStateStorage::new(disk_timeline.clone());
|
||||
@@ -180,8 +173,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
|
||||
control_file_save_interval: Duration::from_secs(1),
|
||||
partial_backup_concurrency: 1,
|
||||
eviction_min_resident: Duration::ZERO,
|
||||
wal_reader_fanout: false,
|
||||
max_delta_for_fanout: None,
|
||||
};
|
||||
|
||||
let mut global = GlobalMap::new(disk, conf.clone())?;
|
||||
@@ -287,7 +278,7 @@ impl ConnState {
|
||||
bail!("finished processing START_REPLICATION")
|
||||
}
|
||||
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
||||
debug!("got msg: {:?}", msg);
|
||||
self.process(msg, global)
|
||||
} else {
|
||||
|
||||
@@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" }
|
||||
utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled';
|
||||
UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause';
|
||||
@@ -1,2 +0,0 @@
|
||||
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause';
|
||||
UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled';
|
||||
@@ -112,7 +112,7 @@ impl TenantShardDrain {
|
||||
}
|
||||
}
|
||||
|
||||
match tenant_shard.preferred_secondary(scheduler) {
|
||||
match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
|
||||
Some(node) => Some(node),
|
||||
None => {
|
||||
tracing::warn!(
|
||||
|
||||
@@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics};
|
||||
use pageserver_api::controller_api::{
|
||||
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
|
||||
@@ -653,10 +653,6 @@ async fn handle_tenant_list(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let limit: Option<usize> = parse_query_param(&req, "limit")?;
|
||||
let start_after: Option<TenantId> = parse_query_param(&req, "start_after")?;
|
||||
tracing::info!("start_after: {:?}", start_after);
|
||||
|
||||
match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
@@ -664,7 +660,7 @@ async fn handle_tenant_list(
|
||||
ForwardOutcome::NotForwarded(_req) => {}
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_list(limit, start_after))
|
||||
json_response(StatusCode::OK, service.tenant_list())
|
||||
}
|
||||
|
||||
async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -1305,35 +1301,6 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Sets the scheduling policy of the specified safekeeper
|
||||
async fn handle_safekeeper_scheduling_policy(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let body = json_request::<SafekeeperSchedulingPolicyRequest>(&mut req).await?;
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
state
|
||||
.service
|
||||
.set_safekeeper_scheduling_policy(id, body.scheduling_policy)
|
||||
.await?;
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::NO_CONTENT)
|
||||
.body(Body::empty())
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(
|
||||
@@ -1902,18 +1869,7 @@ pub fn make_router(
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id", |r| {
|
||||
// id is in the body
|
||||
named_request_span(
|
||||
r,
|
||||
handle_upsert_safekeeper,
|
||||
RequestName("v1_safekeeper_post"),
|
||||
)
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id/scheduling_policy", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_safekeeper_scheduling_policy,
|
||||
RequestName("v1_safekeeper_status"),
|
||||
)
|
||||
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
|
||||
@@ -53,16 +53,6 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// How many shards are not scheduled into their preferred AZ
|
||||
pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
|
||||
|
||||
/// How many shard locations (secondary or attached) on each node
|
||||
pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many _attached_ shard locations on each node
|
||||
pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
|
||||
/// preferred AZ)
|
||||
pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many shards would like to reconcile but were blocked by concurrency limits
|
||||
pub(crate) storage_controller_pending_reconciles: measured::Gauge,
|
||||
|
||||
@@ -142,15 +132,6 @@ impl Default for StorageControllerMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = NodeLabelGroupSet)]
|
||||
pub(crate) struct NodeLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) az: &'a str,
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) node_id: &'a str,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = ReconcileCompleteLabelGroupSet)]
|
||||
pub(crate) struct ReconcileCompleteLabelGroup {
|
||||
|
||||
@@ -299,7 +299,6 @@ impl Node {
|
||||
id: self.id,
|
||||
availability: self.availability.clone().into(),
|
||||
scheduling: self.scheduling,
|
||||
availability_zone_id: self.availability_zone_id.0.clone(),
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
|
||||
@@ -708,11 +708,10 @@ impl Persistence {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified)
|
||||
pub(crate) async fn set_tenant_shard_preferred_azs(
|
||||
&self,
|
||||
preferred_azs: Vec<(TenantShardId, Option<AvailabilityZone>)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, Option<AvailabilityZone>)>> {
|
||||
preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
|
||||
@@ -723,7 +722,7 @@ impl Persistence {
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone())))
|
||||
.set(preferred_az_id.eq(preferred_az.0.clone()))
|
||||
.execute(conn)?;
|
||||
|
||||
if updated == 1 {
|
||||
@@ -1104,37 +1103,6 @@ impl Persistence {
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn set_safekeeper_scheduling_policy(
|
||||
&self,
|
||||
id_: i64,
|
||||
scheduling_policy_: SkSchedulingPolicy,
|
||||
) -> Result<(), DatabaseError> {
|
||||
use crate::schema::safekeepers::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
#[derive(Insertable, AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::safekeepers)]
|
||||
struct UpdateSkSchedulingPolicy<'a> {
|
||||
id: i64,
|
||||
scheduling_policy: &'a str,
|
||||
}
|
||||
let scheduling_policy_ = String::from(scheduling_policy_);
|
||||
|
||||
let rows_affected = diesel::update(safekeepers.filter(id.eq(id_)))
|
||||
.set(scheduling_policy.eq(scheduling_policy_))
|
||||
.execute(conn)?;
|
||||
|
||||
if rows_affected != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of rows ({rows_affected})",
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
|
||||
@@ -826,21 +826,7 @@ impl Reconciler {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(ReconcileError::Cancel);
|
||||
}
|
||||
// We only try to configure secondary locations if the node is available. This does
|
||||
// not stop us succeeding with the reconcile, because our core goal is to make the
|
||||
// shard _available_ (the attached location), and configuring secondary locations
|
||||
// can be done lazily when the node becomes available (via background reconciliation).
|
||||
if node.is_available() {
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
} else {
|
||||
// If the node is unavailable, we skip and consider the reconciliation successful: this
|
||||
// is a common case where a pageserver is marked unavailable: we demote a location on
|
||||
// that unavailable pageserver to secondary.
|
||||
tracing::info!("Skipping configuring secondary location {node}, it is unavailable");
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||
}
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
}
|
||||
|
||||
// The condition below identifies a detach. We must have no attached intent and
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard};
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
|
||||
use serde::Serialize;
|
||||
@@ -32,9 +32,6 @@ pub(crate) struct SchedulerNode {
|
||||
shard_count: usize,
|
||||
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
attached_shard_count: usize,
|
||||
/// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node
|
||||
/// is in their preferred AZ (i.e. this is their 'home' location)
|
||||
home_shard_count: usize,
|
||||
/// Availability zone id in which the node resides
|
||||
az: AvailabilityZone,
|
||||
|
||||
@@ -50,12 +47,6 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Self>;
|
||||
|
||||
/// Return a score that drops any components based on node utilization: this is useful
|
||||
/// for finding scores for scheduling optimisation, when we want to avoid rescheduling
|
||||
/// shards due to e.g. disk usage, to avoid flapping.
|
||||
fn for_optimization(&self) -> Self;
|
||||
|
||||
fn is_overloaded(&self) -> bool;
|
||||
fn node_id(&self) -> NodeId;
|
||||
}
|
||||
@@ -145,13 +136,17 @@ impl PartialOrd for SecondaryAzMatch {
|
||||
/// Ordering is given by member declaration order (top to bottom).
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub(crate) struct NodeAttachmentSchedulingScore {
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Flag indicating whether this node matches the preferred AZ
|
||||
/// of the shard. For equal affinity scores, nodes in the matching AZ
|
||||
/// are considered first.
|
||||
az_match: AttachmentAzMatch,
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Size of [`ScheduleContext::attached_nodes`] for the current node.
|
||||
/// This normally tracks the number of attached shards belonging to the
|
||||
/// tenant being scheduled that are already on this node.
|
||||
attached_shards_in_context: usize,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
@@ -182,25 +177,13 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE),
|
||||
az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
|
||||
attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
|
||||
utilization_score: utilization.cached_score(),
|
||||
total_attached_shard_count: node.attached_shard_count,
|
||||
node_id: *node_id,
|
||||
})
|
||||
}
|
||||
|
||||
/// For use in scheduling optimisation, where we only want to consider the aspects
|
||||
/// of the score that can only be resolved by moving things (such as inter-shard affinity
|
||||
/// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which
|
||||
/// can fluctuate for other reasons)
|
||||
fn for_optimization(&self) -> Self {
|
||||
Self {
|
||||
utilization_score: 0,
|
||||
total_attached_shard_count: 0,
|
||||
node_id: NodeId(0),
|
||||
..*self
|
||||
}
|
||||
}
|
||||
|
||||
fn is_overloaded(&self) -> bool {
|
||||
PageserverUtilization::is_overloaded(self.utilization_score)
|
||||
}
|
||||
@@ -225,9 +208,9 @@ pub(crate) struct NodeSecondarySchedulingScore {
|
||||
affinity_score: AffinityScore,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Anti-affinity with other non-home locations: this gives the behavior that secondaries
|
||||
/// will spread out across the nodes in an AZ.
|
||||
total_non_home_shard_count: usize,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
/// acts as an anti-affinity between attached shards.
|
||||
total_attached_shard_count: usize,
|
||||
/// Convenience to make selection deterministic in tests and empty systems
|
||||
node_id: NodeId,
|
||||
}
|
||||
@@ -254,20 +237,11 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE),
|
||||
utilization_score: utilization.cached_score(),
|
||||
total_non_home_shard_count: (node.shard_count - node.home_shard_count),
|
||||
total_attached_shard_count: node.attached_shard_count,
|
||||
node_id: *node_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn for_optimization(&self) -> Self {
|
||||
Self {
|
||||
utilization_score: 0,
|
||||
total_non_home_shard_count: 0,
|
||||
node_id: NodeId(0),
|
||||
..*self
|
||||
}
|
||||
}
|
||||
|
||||
fn is_overloaded(&self) -> bool {
|
||||
PageserverUtilization::is_overloaded(self.utilization_score)
|
||||
}
|
||||
@@ -319,10 +293,6 @@ impl AffinityScore {
|
||||
pub(crate) fn inc(&mut self) {
|
||||
self.0 += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn dec(&mut self) {
|
||||
self.0 -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add for AffinityScore {
|
||||
@@ -354,6 +324,9 @@ pub(crate) struct ScheduleContext {
|
||||
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
|
||||
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
|
||||
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
|
||||
pub(crate) mode: ScheduleMode,
|
||||
}
|
||||
|
||||
@@ -361,6 +334,7 @@ impl ScheduleContext {
|
||||
pub(crate) fn new(mode: ScheduleMode) -> Self {
|
||||
Self {
|
||||
nodes: HashMap::new(),
|
||||
attached_nodes: HashMap::new(),
|
||||
mode,
|
||||
}
|
||||
}
|
||||
@@ -374,31 +348,25 @@ impl ScheduleContext {
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove `shard`'s contributions to this context. This is useful when considering scheduling
|
||||
/// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location.
|
||||
pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self {
|
||||
let mut new_context = self.clone();
|
||||
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
if let Some(score) = new_context.nodes.get_mut(attached) {
|
||||
score.dec();
|
||||
}
|
||||
}
|
||||
|
||||
for secondary in shard.intent.get_secondary() {
|
||||
if let Some(score) = new_context.nodes.get_mut(secondary) {
|
||||
score.dec();
|
||||
}
|
||||
}
|
||||
|
||||
new_context
|
||||
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
|
||||
let entry = self.attached_nodes.entry(node_id).or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
|
||||
self.nodes
|
||||
.get(&node_id)
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE)
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
|
||||
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
|
||||
}
|
||||
|
||||
/// For test, track the sum of AffinityScore values, which is effectively how many
|
||||
/// attached or secondary locations have been registered with this context.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn location_count(&self) -> usize {
|
||||
self.nodes.values().map(|i| i.0).sum()
|
||||
pub(crate) fn attach_count(&self) -> usize {
|
||||
self.attached_nodes.values().sum()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -420,7 +388,6 @@ impl Scheduler {
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
home_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
},
|
||||
@@ -448,7 +415,6 @@ impl Scheduler {
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
home_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
},
|
||||
@@ -461,9 +427,6 @@ impl Scheduler {
|
||||
Some(node) => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
if Some(&node.az) == shard.preferred_az() {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
@@ -475,12 +438,7 @@ impl Scheduler {
|
||||
|
||||
for node_id in shard.intent.get_secondary() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => {
|
||||
node.shard_count += 1;
|
||||
if Some(&node.az) == shard.preferred_az() {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
@@ -524,20 +482,13 @@ impl Scheduler {
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn update_node_ref_counts(
|
||||
&mut self,
|
||||
node_id: NodeId,
|
||||
preferred_az: Option<&AvailabilityZone>,
|
||||
update: RefCountUpdate,
|
||||
) {
|
||||
pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
let is_home_az = Some(&node.az) == preferred_az;
|
||||
|
||||
match update {
|
||||
RefCountUpdate::PromoteSecondary => {
|
||||
node.attached_shard_count += 1;
|
||||
@@ -545,31 +496,19 @@ impl Scheduler {
|
||||
RefCountUpdate::Attach => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
RefCountUpdate::Detach => {
|
||||
node.shard_count -= 1;
|
||||
node.attached_shard_count -= 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count -= 1;
|
||||
}
|
||||
}
|
||||
RefCountUpdate::DemoteAttached => {
|
||||
node.attached_shard_count -= 1;
|
||||
}
|
||||
RefCountUpdate::AddSecondary => {
|
||||
node.shard_count += 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
RefCountUpdate::RemoveSecondary => {
|
||||
node.shard_count -= 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -655,7 +594,6 @@ impl Scheduler {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
home_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
});
|
||||
@@ -669,20 +607,33 @@ impl Scheduler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate a single node's score, used in optimizer logic to compare specific
|
||||
/// nodes' scores.
|
||||
pub(crate) fn compute_node_score<Score>(
|
||||
&mut self,
|
||||
node_id: NodeId,
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Score>
|
||||
where
|
||||
Score: NodeSchedulingScore,
|
||||
{
|
||||
self.nodes
|
||||
.get_mut(&node_id)
|
||||
.and_then(|node| Score::generate(&node_id, node, preferred_az, context))
|
||||
/// Where we have several nodes to choose from, for example when picking a secondary location
|
||||
/// to promote to an attached location, this method may be used to pick the best choice based
|
||||
/// on the scheduler's knowledge of utilization and availability.
|
||||
///
|
||||
/// If the input is empty, or none of the nodes are eligible for scheduling, return None: the
|
||||
/// caller can pick a node some other way.
|
||||
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
|
||||
if nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// TODO: When the utilization score returned by the pageserver becomes meaningful,
|
||||
// schedule based on that instead of the shard count.
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| !matches!(n.may_schedule, MaySchedule::No))
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
.max_by_key(|(_n, may_schedule)| *may_schedule);
|
||||
|
||||
// If even the preferred node has may_schedule==false, return None
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
/// Compute a scheduling score for each node that the scheduler knows of
|
||||
@@ -776,7 +727,7 @@ impl Scheduler {
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
|
||||
);
|
||||
);
|
||||
}
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
@@ -792,74 +743,47 @@ impl Scheduler {
|
||||
}
|
||||
|
||||
/// For choosing which AZ to schedule a new shard into, use this. It will return the
|
||||
/// AZ with the lowest number of shards currently scheduled in this AZ as their home
|
||||
/// location.
|
||||
/// AZ with the lowest median utilization.
|
||||
///
|
||||
/// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
|
||||
/// node, because while tenants start out single sharded, when they grow and undergo
|
||||
/// shard-split, they will occupy space on many nodes within an AZ. It is important
|
||||
/// that we pick the AZ in a way that balances this _future_ load.
|
||||
/// shard-split, they will occupy space on many nodes within an AZ.
|
||||
///
|
||||
/// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
|
||||
/// nodes' utilization scores.
|
||||
/// We use median rather than total free space or mean utilization, because
|
||||
/// we wish to avoid preferring AZs that have low-load nodes resulting from
|
||||
/// recent replacements.
|
||||
///
|
||||
/// The practical result is that we will pick an AZ based on its median node, and
|
||||
/// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
|
||||
pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
|
||||
if self.nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct AzScore {
|
||||
home_shard_count: usize,
|
||||
scheduleable: bool,
|
||||
let mut scores_by_az = HashMap::new();
|
||||
for (node_id, node) in &self.nodes {
|
||||
let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
|
||||
let score = match &node.may_schedule {
|
||||
MaySchedule::Yes(utilization) => utilization.score(),
|
||||
MaySchedule::No => PageserverUtilization::full().score(),
|
||||
};
|
||||
az_scores.push((node_id, node, score));
|
||||
}
|
||||
|
||||
let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
|
||||
for node in self.nodes.values() {
|
||||
let az = azs.entry(&node.az).or_default();
|
||||
az.home_shard_count += node.home_shard_count;
|
||||
az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
|
||||
// Sort by utilization. Also include the node ID to break ties.
|
||||
for scores in scores_by_az.values_mut() {
|
||||
scores.sort_by_key(|i| (i.2, i.0));
|
||||
}
|
||||
|
||||
// If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
|
||||
// all nodes are overloaded or otherwise unschedulable).
|
||||
if azs.values().any(|i| i.scheduleable) {
|
||||
azs.retain(|_, i| i.scheduleable);
|
||||
}
|
||||
|
||||
// Find the AZ with the lowest number of shards currently allocated
|
||||
Some(
|
||||
azs.into_iter()
|
||||
.min_by_key(|i| (i.1.home_shard_count, i.0))
|
||||
.unwrap()
|
||||
.0
|
||||
.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
|
||||
self.nodes.get(node_id).map(|n| n.az.clone())
|
||||
}
|
||||
|
||||
/// For use when choosing a preferred secondary location: filter out nodes that are not
|
||||
/// available, and gather their AZs.
|
||||
pub(crate) fn filter_usable_nodes(
|
||||
&self,
|
||||
nodes: &[NodeId],
|
||||
) -> Vec<(NodeId, Option<AvailabilityZone>)> {
|
||||
nodes
|
||||
let mut median_by_az = scores_by_az
|
||||
.iter()
|
||||
.filter_map(|node_id| {
|
||||
let node = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.expect("Referenced nodes always exist");
|
||||
if matches!(node.may_schedule, MaySchedule::Yes(_)) {
|
||||
Some((*node_id, Some(node.az.clone())))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
|
||||
.collect::<Vec<_>>();
|
||||
// Sort by utilization. Also include the AZ to break ties.
|
||||
median_by_az.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
// Return the AZ with the lowest median utilization
|
||||
Some(median_by_az.first().unwrap().0.clone())
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
@@ -872,33 +796,6 @@ impl Scheduler {
|
||||
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().attached_shard_count
|
||||
}
|
||||
|
||||
/// Some metrics that we only calculate periodically: this is simpler than
|
||||
/// rigorously updating them on every change.
|
||||
pub(crate) fn update_metrics(&self) {
|
||||
for (node_id, node) in &self.nodes {
|
||||
let node_id_str = format!("{}", node_id);
|
||||
let label_group = NodeLabelGroup {
|
||||
az: &node.az.0,
|
||||
node_id: &node_id_str,
|
||||
};
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_shards
|
||||
.set(label_group.clone(), node.shard_count as i64);
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_attached_shards
|
||||
.set(label_group.clone(), node.attached_shard_count as i64);
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_home_shards
|
||||
.set(label_group.clone(), node.home_shard_count as i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -946,14 +843,7 @@ pub(crate) mod test_utils {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::{
|
||||
controller_api::NodeAvailability, models::utilization::test_utilization,
|
||||
shard::ShardIdentity,
|
||||
};
|
||||
use utils::{
|
||||
id::TenantId,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -963,8 +853,8 @@ mod tests {
|
||||
let nodes = test_utils::make_test_nodes(2, &[]);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new(None);
|
||||
let mut t2_intent = IntentState::new(None);
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
|
||||
let context = ScheduleContext::default();
|
||||
|
||||
@@ -1040,7 +930,7 @@ mod tests {
|
||||
let scheduled = scheduler
|
||||
.schedule_shard::<AttachedShardTag>(&[], &None, context)
|
||||
.unwrap();
|
||||
let mut intent = IntentState::new(None);
|
||||
let mut intent = IntentState::new();
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
assert_eq!(scheduled, expect_node);
|
||||
@@ -1173,7 +1063,7 @@ mod tests {
|
||||
let scheduled = scheduler
|
||||
.schedule_shard::<Tag>(&[], &preferred_az, context)
|
||||
.unwrap();
|
||||
let mut intent = IntentState::new(preferred_az.clone());
|
||||
let mut intent = IntentState::new();
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
assert_eq!(scheduled, expect_node);
|
||||
@@ -1199,9 +1089,9 @@ mod tests {
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id.
|
||||
// Node 2 is not in "az-a", but it has the lowest affinity so we prefer that.
|
||||
assert_scheduler_chooses::<AttachedShardTag>(
|
||||
NodeId(1),
|
||||
NodeId(2),
|
||||
Some(az_a_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
@@ -1217,6 +1107,26 @@ mod tests {
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-b" for the secondary location.
|
||||
// Nodes 1 and 3 are identically loaded, so prefer the lowest node id.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(1),
|
||||
Some(az_b_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-b" for the secondary location.
|
||||
// Node 3 has lower affinity score than 1, so prefer that.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(3),
|
||||
Some(az_b_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
for mut intent in scheduled_intents {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
@@ -1240,292 +1150,34 @@ mod tests {
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
/// Force the `home_shard_count` of a node directly: this is the metric used
|
||||
/// by the scheduler when picking AZs.
|
||||
fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) {
|
||||
let node = scheduler.nodes.get_mut(&node_id).unwrap();
|
||||
node.home_shard_count = shard_count;
|
||||
/// Force the utilization of a node in Scheduler's state to a particular
|
||||
/// number of bytes used.
|
||||
fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
|
||||
let mut node = Node::new(
|
||||
node_id,
|
||||
"".to_string(),
|
||||
0,
|
||||
"".to_string(),
|
||||
0,
|
||||
scheduler.nodes.get(&node_id).unwrap().az.clone(),
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(test_utilization::simple(
|
||||
shard_count,
|
||||
0,
|
||||
)));
|
||||
scheduler.node_upsert(&node);
|
||||
}
|
||||
|
||||
// Initial empty state. Scores are tied, scheduler prefers lower AZ ID.
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
|
||||
// Home shard count is higher in AZ A, so AZ B will be preferred
|
||||
set_shard_count(&mut scheduler, NodeId(1), 10);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
|
||||
|
||||
// Total home shard count is higher in AZ B, so we revert to preferring AZ A
|
||||
set_shard_count(&mut scheduler, NodeId(4), 6);
|
||||
set_shard_count(&mut scheduler, NodeId(5), 6);
|
||||
// Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
|
||||
set_utilization(&mut scheduler, NodeId(1), 1000000);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
}
|
||||
|
||||
/// Test that when selecting AZs for many new tenants, we get the expected balance across nodes
|
||||
#[test]
|
||||
fn az_selection_many() {
|
||||
let az_a_tag = AvailabilityZone("az-a".to_string());
|
||||
let az_b_tag = AvailabilityZone("az-b".to_string());
|
||||
let az_c_tag = AvailabilityZone("az-c".to_string());
|
||||
let nodes = test_utils::make_test_nodes(
|
||||
6,
|
||||
&[
|
||||
az_a_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
az_c_tag.clone(),
|
||||
az_a_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
az_c_tag.clone(),
|
||||
],
|
||||
);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
// We should get 1/6th of these on each node, give or take a few...
|
||||
let total_tenants = 300;
|
||||
|
||||
// ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot
|
||||
// on one AZ before correcting itself. This is because we select the 'home' AZ based on
|
||||
// an AZ-wide metric, but we select the location for secondaries on a purely node-based
|
||||
// metric (while excluding the home AZ).
|
||||
let grace = 3;
|
||||
|
||||
let mut scheduled_shards = Vec::new();
|
||||
for _i in 0..total_tenants {
|
||||
let preferred_az = scheduler.get_az_for_new_tenant().unwrap();
|
||||
|
||||
let mut node_home_counts = scheduler
|
||||
.nodes
|
||||
.iter()
|
||||
.map(|(node_id, node)| (node_id, node.home_shard_count))
|
||||
.collect::<Vec<_>>();
|
||||
node_home_counts.sort_by_key(|i| i.0);
|
||||
eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::generate(),
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(1),
|
||||
};
|
||||
|
||||
let shard_identity = ShardIdentity::new(
|
||||
tenant_shard_id.shard_number,
|
||||
tenant_shard_id.shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(1),
|
||||
)
|
||||
.unwrap();
|
||||
let mut shard = TenantShard::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
pageserver_api::controller_api::PlacementPolicy::Attached(1),
|
||||
Some(preferred_az),
|
||||
);
|
||||
|
||||
let mut context = ScheduleContext::default();
|
||||
shard.schedule(&mut scheduler, &mut context).unwrap();
|
||||
eprintln!("Scheduled shard at {:?}", shard.intent);
|
||||
|
||||
scheduled_shards.push(shard);
|
||||
}
|
||||
|
||||
for (node_id, node) in &scheduler.nodes {
|
||||
eprintln!(
|
||||
"Node {}: {} {} {}",
|
||||
node_id, node.shard_count, node.attached_shard_count, node.home_shard_count
|
||||
);
|
||||
}
|
||||
|
||||
for node in scheduler.nodes.values() {
|
||||
assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace);
|
||||
}
|
||||
|
||||
for mut shard in scheduled_shards {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// Make sure that when we have an odd number of nodes and an even number of shards, we still
|
||||
/// get scheduling stability.
|
||||
fn odd_nodes_stability() {
|
||||
let az_a = AvailabilityZone("az-a".to_string());
|
||||
let az_b = AvailabilityZone("az-b".to_string());
|
||||
|
||||
let nodes = test_utils::make_test_nodes(
|
||||
10,
|
||||
&[
|
||||
az_a.clone(),
|
||||
az_a.clone(),
|
||||
az_a.clone(),
|
||||
az_a.clone(),
|
||||
az_a.clone(),
|
||||
az_b.clone(),
|
||||
az_b.clone(),
|
||||
az_b.clone(),
|
||||
az_b.clone(),
|
||||
az_b.clone(),
|
||||
],
|
||||
);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
// Need to keep these alive because they contribute to shard counts via RAII
|
||||
let mut scheduled_shards = Vec::new();
|
||||
|
||||
let mut context = ScheduleContext::default();
|
||||
|
||||
fn schedule_shard(
|
||||
tenant_shard_id: TenantShardId,
|
||||
expect_attached: NodeId,
|
||||
expect_secondary: NodeId,
|
||||
scheduled_shards: &mut Vec<TenantShard>,
|
||||
scheduler: &mut Scheduler,
|
||||
preferred_az: Option<AvailabilityZone>,
|
||||
context: &mut ScheduleContext,
|
||||
) {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
tenant_shard_id.shard_number,
|
||||
tenant_shard_id.shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(1),
|
||||
)
|
||||
.unwrap();
|
||||
let mut shard = TenantShard::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
pageserver_api::controller_api::PlacementPolicy::Attached(1),
|
||||
preferred_az,
|
||||
);
|
||||
|
||||
shard.schedule(scheduler, context).unwrap();
|
||||
|
||||
assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
|
||||
assert_eq!(
|
||||
shard.intent.get_secondary().first().unwrap(),
|
||||
&expect_secondary
|
||||
);
|
||||
|
||||
scheduled_shards.push(shard);
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(1),
|
||||
NodeId(6),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(1),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(2),
|
||||
NodeId(7),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(2),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(3),
|
||||
NodeId(8),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(3),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(4),
|
||||
NodeId(9),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(4),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(5),
|
||||
NodeId(10),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(5),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(1),
|
||||
NodeId(6),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(6),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(2),
|
||||
NodeId(7),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(7),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(3),
|
||||
NodeId(8),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
Some(az_a.clone()),
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Assert that the optimizer suggests no changes, i.e. our initial scheduling was stable.
|
||||
for shard in &scheduled_shards {
|
||||
assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None);
|
||||
}
|
||||
|
||||
for mut shard in scheduled_shards {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
// Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
|
||||
// should prefer the other AZ.
|
||||
set_utilization(&mut scheduler, NodeId(2), 1000000);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ use pageserver_api::{
|
||||
AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
|
||||
NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
|
||||
ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
|
||||
ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
|
||||
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
|
||||
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse,
|
||||
@@ -1404,11 +1404,7 @@ impl Service {
|
||||
|
||||
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
|
||||
// it with what we can infer: the node for which a generation was most recently issued.
|
||||
let mut intent = IntentState::new(
|
||||
tsp.preferred_az_id
|
||||
.as_ref()
|
||||
.map(|az| AvailabilityZone(az.clone())),
|
||||
);
|
||||
let mut intent = IntentState::new();
|
||||
if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64))
|
||||
{
|
||||
if nodes.contains_key(&generation_pageserver) {
|
||||
@@ -2478,30 +2474,19 @@ impl Service {
|
||||
tenant_id: TenantId,
|
||||
_guard: &TracingExclusiveGuard<TenantOperations>,
|
||||
) -> Result<(), ApiError> {
|
||||
// Check if the tenant is present in memory, and select an AZ to use when loading
|
||||
// if we will load it.
|
||||
let load_in_az = {
|
||||
let present_in_memory = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let existing = locked
|
||||
locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.next();
|
||||
|
||||
// If the tenant is not present in memory, we expect to load it from database,
|
||||
// so let's figure out what AZ to load it into while we have self.inner locked.
|
||||
if existing.is_none() {
|
||||
locked
|
||||
.scheduler
|
||||
.get_az_for_new_tenant()
|
||||
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"No AZ with nodes found to load tenant"
|
||||
)))?
|
||||
} else {
|
||||
// We already have this tenant in memory
|
||||
return Ok(());
|
||||
}
|
||||
.next()
|
||||
.is_some()
|
||||
};
|
||||
|
||||
if present_in_memory {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenant_shards = self.persistence.load_tenant(tenant_id).await?;
|
||||
if tenant_shards.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
@@ -2509,20 +2494,8 @@ impl Service {
|
||||
));
|
||||
}
|
||||
|
||||
// Update the persistent shards with the AZ that we are about to apply to in-memory state
|
||||
self.persistence
|
||||
.set_tenant_shard_preferred_azs(
|
||||
tenant_shards
|
||||
.iter()
|
||||
.map(|t| {
|
||||
(
|
||||
t.get_tenant_shard_id().expect("Corrupt shard in database"),
|
||||
Some(load_in_az.clone()),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
)
|
||||
.await?;
|
||||
// TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running
|
||||
// compute, so no benefit to making AZ sticky across detaches.
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
tracing::info!(
|
||||
@@ -2532,7 +2505,7 @@ impl Service {
|
||||
);
|
||||
|
||||
locked.tenants.extend(tenant_shards.into_iter().map(|p| {
|
||||
let intent = IntentState::new(Some(load_in_az.clone()));
|
||||
let intent = IntentState::new();
|
||||
let shard =
|
||||
TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database");
|
||||
|
||||
@@ -4158,42 +4131,17 @@ impl Service {
|
||||
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
|
||||
}
|
||||
|
||||
/// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
|
||||
/// avoid traversing data, it just avoids returning it. This is suitable for our purposes, since our in memory
|
||||
/// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
|
||||
/// in our external API.
|
||||
pub(crate) fn tenant_list(
|
||||
&self,
|
||||
limit: Option<usize>,
|
||||
start_after: Option<TenantId>,
|
||||
) -> Vec<TenantDescribeResponse> {
|
||||
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
// Apply start_after parameter
|
||||
let shard_range = match start_after {
|
||||
None => locked.tenants.range(..),
|
||||
Some(tenant_id) => locked.tenants.range(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(u8::MAX),
|
||||
shard_count: ShardCount(u8::MAX),
|
||||
}..,
|
||||
),
|
||||
};
|
||||
|
||||
let mut result = Vec::new();
|
||||
for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) {
|
||||
for (_tenant_id, tenant_shards) in
|
||||
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
|
||||
{
|
||||
result.push(
|
||||
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
|
||||
.expect("Groups are always non-empty"),
|
||||
);
|
||||
|
||||
// Enforce `limit` parameter
|
||||
if let Some(limit) = limit {
|
||||
if result.len() >= limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
@@ -4288,22 +4236,6 @@ impl Service {
|
||||
}
|
||||
|
||||
tracing::info!("Restoring parent shard {tenant_shard_id}");
|
||||
|
||||
// Drop any intents that refer to unavailable nodes, to enable this abort to proceed even
|
||||
// if the original attachment location is offline.
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
if !nodes.get(node_id).unwrap().is_available() {
|
||||
tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}");
|
||||
shard.intent.demote_attached(scheduler, *node_id);
|
||||
}
|
||||
}
|
||||
for node_id in shard.intent.get_secondary().clone() {
|
||||
if !nodes.get(&node_id).unwrap().is_available() {
|
||||
tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}");
|
||||
shard.intent.remove_secondary(scheduler, node_id);
|
||||
}
|
||||
}
|
||||
|
||||
shard.splitting = SplitState::Idle;
|
||||
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
|
||||
// If this shard can't be scheduled now (perhaps due to offline nodes or
|
||||
@@ -4457,13 +4389,15 @@ impl Service {
|
||||
|
||||
let mut child_state =
|
||||
TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
|
||||
child_state.intent =
|
||||
IntentState::single(scheduler, Some(pageserver), preferred_az.clone());
|
||||
child_state.intent = IntentState::single(scheduler, Some(pageserver));
|
||||
child_state.observed = ObservedState {
|
||||
locations: child_observed,
|
||||
};
|
||||
child_state.generation = Some(generation);
|
||||
child_state.config = config.clone();
|
||||
if let Some(preferred_az) = &preferred_az {
|
||||
child_state.set_preferred_az(preferred_az.clone());
|
||||
}
|
||||
|
||||
// The child's TenantShard::splitting is intentionally left at the default value of Idle,
|
||||
// as at this point in the split process we have succeeded and this part is infallible:
|
||||
@@ -5080,8 +5014,6 @@ impl Service {
|
||||
// If our new attached node was a secondary, it no longer should be.
|
||||
shard.intent.remove_secondary(scheduler, migrate_req.node_id);
|
||||
|
||||
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
|
||||
|
||||
// If we were already attached to something, demote that to a secondary
|
||||
if let Some(old_attached) = old_attached {
|
||||
if n > 0 {
|
||||
@@ -5093,6 +5025,8 @@ impl Service {
|
||||
shard.intent.push_secondary(scheduler, old_attached);
|
||||
}
|
||||
}
|
||||
|
||||
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
|
||||
}
|
||||
PlacementPolicy::Secondary => {
|
||||
shard.intent.clear(scheduler);
|
||||
@@ -5411,15 +5345,6 @@ impl Service {
|
||||
|
||||
expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
|
||||
|
||||
// Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig`
|
||||
// definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new
|
||||
// fields are added, before doing a comparison.
|
||||
for tsp in &mut persistent_shards {
|
||||
let config: TenantConfig = serde_json::from_str(&tsp.config)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible");
|
||||
}
|
||||
|
||||
if persistent_shards != expect_shards {
|
||||
tracing::error!("Consistency check failed on shards.");
|
||||
|
||||
@@ -5787,7 +5712,7 @@ impl Service {
|
||||
register_req.listen_http_port,
|
||||
register_req.listen_pg_addr,
|
||||
register_req.listen_pg_port,
|
||||
register_req.availability_zone_id.clone(),
|
||||
register_req.availability_zone_id,
|
||||
);
|
||||
|
||||
// TODO: idempotency if the node already exists in the database
|
||||
@@ -5807,9 +5732,8 @@ impl Service {
|
||||
.set(locked.nodes.len() as i64);
|
||||
|
||||
tracing::info!(
|
||||
"Registered pageserver {} ({}), now have {} pageservers",
|
||||
"Registered pageserver {}, now have {} pageservers",
|
||||
register_req.node_id,
|
||||
register_req.availability_zone_id,
|
||||
locked.nodes.len()
|
||||
);
|
||||
Ok(())
|
||||
@@ -6424,7 +6348,7 @@ impl Service {
|
||||
/// available. A return value of 0 indicates that everything is fully reconciled already.
|
||||
fn reconcile_all(&self) -> usize {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
let pageservers = nodes.clone();
|
||||
|
||||
// This function is an efficient place to update lazy statistics, since we are walking
|
||||
@@ -6485,9 +6409,6 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
// Some metrics are calculated from SchedulerNode state, update these periodically
|
||||
scheduler.update_metrics();
|
||||
|
||||
// Process any deferred tenant drops
|
||||
for (tenant_id, guard) in drop_detached_tenants {
|
||||
self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
|
||||
@@ -6546,7 +6467,6 @@ impl Service {
|
||||
// Shard was dropped between planning and execution;
|
||||
continue;
|
||||
};
|
||||
tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
|
||||
if shard.apply_optimization(scheduler, optimization) {
|
||||
optimizations_applied += 1;
|
||||
if self.maybe_reconcile_shard(shard, nodes).is_some() {
|
||||
@@ -6577,13 +6497,7 @@ impl Service {
|
||||
|
||||
let mut work = Vec::new();
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (_nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
// We are going to plan a bunch of optimisations before applying any of them, so the
|
||||
// utilisation stats on nodes will be effectively stale for the >1st optimisation we
|
||||
// generate. To avoid this causing unstable migrations/flapping, it's important that the
|
||||
// code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`]
|
||||
// to ignore the utilisation component of the score.
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
for (_tenant_id, schedule_context, shards) in
|
||||
TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
|
||||
@@ -6614,28 +6528,13 @@ impl Service {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Fast path: we may quickly identify shards that don't have any possible optimisations
|
||||
if !shard.maybe_optimizable(scheduler, &schedule_context) {
|
||||
if cfg!(feature = "testing") {
|
||||
// Check that maybe_optimizable doesn't disagree with the actual optimization functions.
|
||||
// Only do this in testing builds because it is not a correctness-critical check, so we shouldn't
|
||||
// panic in prod if we hit this, or spend cycles on it in prod.
|
||||
assert!(shard
|
||||
.optimize_attachment(scheduler, &schedule_context)
|
||||
.is_none());
|
||||
assert!(shard
|
||||
.optimize_secondary(scheduler, &schedule_context)
|
||||
.is_none());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: optimization calculations are relatively expensive: create some fast-path for
|
||||
// the common idle case (avoiding the search on tenants that we have recently checked)
|
||||
if let Some(optimization) =
|
||||
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
|
||||
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
|
||||
// its primary location based on soft constraints, cut it over.
|
||||
shard.optimize_attachment(scheduler, &schedule_context)
|
||||
shard.optimize_attachment(nodes, &schedule_context)
|
||||
{
|
||||
tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}");
|
||||
work.push((shard.tenant_shard_id, optimization));
|
||||
break;
|
||||
} else if let Some(optimization) =
|
||||
@@ -6645,7 +6544,6 @@ impl Service {
|
||||
// in the same tenant with secondary locations on the node where they originally split.
|
||||
shard.optimize_secondary(scheduler, &schedule_context)
|
||||
{
|
||||
tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}");
|
||||
work.push((shard.tenant_shard_id, optimization));
|
||||
break;
|
||||
}
|
||||
@@ -6694,10 +6592,8 @@ impl Service {
|
||||
}
|
||||
}
|
||||
}
|
||||
ScheduleOptimizationAction::ReplaceSecondary(_)
|
||||
| ScheduleOptimizationAction::CreateSecondary(_)
|
||||
| ScheduleOptimizationAction::RemoveSecondary(_) => {
|
||||
// No extra checks needed to manage secondaries: this does not interrupt client access
|
||||
ScheduleOptimizationAction::ReplaceSecondary(_) => {
|
||||
// No extra checks needed to replace a secondary: this does not interrupt client access
|
||||
validated_work.push((tenant_shard_id, optimization))
|
||||
}
|
||||
};
|
||||
@@ -6769,35 +6665,26 @@ impl Service {
|
||||
/// we have this helper to move things along faster.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
|
||||
let (attached_node, secondaries) = {
|
||||
let (attached_node, secondary_node) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
||||
tracing::warn!(
|
||||
"Skipping kick of secondary download for {tenant_shard_id}: not found"
|
||||
);
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(attached) = shard.intent.get_attached() else {
|
||||
tracing::warn!(
|
||||
"Skipping kick of secondary download for {tenant_shard_id}: no attached"
|
||||
);
|
||||
let (Some(attached), Some(secondary)) = (
|
||||
shard.intent.get_attached(),
|
||||
shard.intent.get_secondary().first(),
|
||||
) else {
|
||||
return;
|
||||
};
|
||||
|
||||
let secondaries = shard
|
||||
.intent
|
||||
.get_secondary()
|
||||
.iter()
|
||||
.map(|n| locked.nodes.get(n).unwrap().clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
(locked.nodes.get(attached).unwrap().clone(), secondaries)
|
||||
(
|
||||
locked.nodes.get(attached).unwrap().clone(),
|
||||
locked.nodes.get(secondary).unwrap().clone(),
|
||||
)
|
||||
};
|
||||
|
||||
// Make remote API calls to upload + download heatmaps: we ignore errors because this is just
|
||||
// a 'kick' to let scheduling optimisation run more promptly.
|
||||
match attached_node
|
||||
attached_node
|
||||
.with_client_retries(
|
||||
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
|
||||
&self.config.jwt_token,
|
||||
@@ -6806,57 +6693,22 @@ impl Service {
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Err(e)) => {
|
||||
tracing::info!(
|
||||
"Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}"
|
||||
);
|
||||
}
|
||||
None => {
|
||||
tracing::info!(
|
||||
"Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}"
|
||||
);
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
tracing::info!(
|
||||
"Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}"
|
||||
);
|
||||
}
|
||||
}
|
||||
.await;
|
||||
|
||||
for secondary_node in secondaries {
|
||||
match secondary_node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(
|
||||
tenant_shard_id,
|
||||
Some(Duration::from_secs(1)),
|
||||
)
|
||||
.await
|
||||
},
|
||||
&self.config.jwt_token,
|
||||
3,
|
||||
10,
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Err(e)) => {
|
||||
tracing::info!(
|
||||
"Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}"
|
||||
);
|
||||
}
|
||||
None => {
|
||||
tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}");
|
||||
}
|
||||
Some(Ok(progress)) => {
|
||||
tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
secondary_node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
|
||||
.await
|
||||
},
|
||||
&self.config.jwt_token,
|
||||
3,
|
||||
10,
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Look for shards which are oversized and in need of splitting
|
||||
@@ -7279,95 +7131,49 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a node fill plan (pick secondaries to promote), based on:
|
||||
/// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
|
||||
/// outside their home AZ, should be migrated back here.
|
||||
/// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of
|
||||
/// attached shards, we will promote more shards from the nodes with the most attached shards, unless
|
||||
/// those shards have a home AZ that doesn't match the node we're filling.
|
||||
/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
|
||||
/// 1. The node should be filled until it reaches the expected cluster average of
|
||||
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
|
||||
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
|
||||
/// throughout the cluster. We achieve this by picking tenant shards from each node,
|
||||
/// starting from the ones with the largest number of attached shards, until the node
|
||||
/// reaches the expected cluster average.
|
||||
/// 3. Avoid promoting more shards of the same tenant than required. The upper bound
|
||||
/// for the number of tenants from the same shard promoted to the node being filled is:
|
||||
/// shard count for the tenant divided by the number of nodes in the cluster.
|
||||
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
|
||||
let node_az = nodes
|
||||
.get(&node_id)
|
||||
.expect("Node must exist")
|
||||
.get_availability_zone_id()
|
||||
.clone();
|
||||
|
||||
// The tenant shard IDs that we plan to promote from secondary to attached on this node
|
||||
let mut plan = Vec::new();
|
||||
|
||||
// Collect shards which do not have a preferred AZ & are elegible for moving in stage 2
|
||||
let mut free_tids_by_node: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
|
||||
// Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could
|
||||
// conceivably come up in real life if deploying a single-AZ region intentionally.
|
||||
let respect_azs = nodes
|
||||
.values()
|
||||
.map(|n| n.get_availability_zone_id())
|
||||
.unique()
|
||||
.count()
|
||||
> 1;
|
||||
|
||||
// Step 1: collect all shards that we are required to migrate back to this node because their AZ preference
|
||||
// requires it.
|
||||
for (tsid, tenant_shard) in tenants {
|
||||
if !tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
// Shard doesn't have a secondary on this node, ignore it.
|
||||
continue;
|
||||
}
|
||||
|
||||
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
|
||||
// shards which belong on this node, not to promote shards whose scheduling preference
|
||||
// would be on their currently attached node. So will avoid promoting shards whose
|
||||
// home AZ doesn't match the AZ of the node we're filling.
|
||||
match tenant_shard.preferred_az() {
|
||||
_ if !respect_azs => {
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
free_tids_by_node.entry(*primary).or_default().push(*tsid);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Shard doesn't have an AZ preference: it is elegible to be moved, but we
|
||||
// will only do so if our target shard count requires it.
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
free_tids_by_node.entry(*primary).or_default().push(*tsid);
|
||||
}
|
||||
}
|
||||
Some(az) if az == &node_az => {
|
||||
// This shard's home AZ is equal to the node we're filling: it should
|
||||
// be moved back to this node as part of filling, unless its currently
|
||||
// attached location is also in its home AZ.
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
if nodes
|
||||
.get(primary)
|
||||
.expect("referenced node must exist")
|
||||
.get_availability_zone_id()
|
||||
!= tenant_shard
|
||||
.preferred_az()
|
||||
.expect("tenant must have an AZ preference")
|
||||
{
|
||||
plan.push(*tsid)
|
||||
}
|
||||
} else {
|
||||
plan.push(*tsid)
|
||||
}
|
||||
}
|
||||
Some(_) => {
|
||||
// This shard's home AZ is somewhere other than the node we're filling,
|
||||
// it may not be moved back to this node as part of filling. Ignore it
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments
|
||||
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
|
||||
|
||||
let mut tids_by_node = locked
|
||||
.tenants
|
||||
.iter_mut()
|
||||
.filter_map(|(tid, tenant_shard)| {
|
||||
if !matches!(
|
||||
tenant_shard.get_scheduling_policy(),
|
||||
ShardSchedulingPolicy::Active
|
||||
) {
|
||||
// Only include tenants in fills if they have a normal (Active) scheduling policy. We
|
||||
// even exclude Essential, because moving to fill a node is not essential to keeping this
|
||||
// tenant available.
|
||||
return None;
|
||||
}
|
||||
|
||||
if tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
return Some((*primary, *tid));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
})
|
||||
.into_group_map();
|
||||
|
||||
let expected_attached = locked.scheduler.expected_attached_shard_count();
|
||||
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
|
||||
|
||||
let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
|
||||
let mut plan = Vec::new();
|
||||
|
||||
for (node_id, attached) in nodes_by_load {
|
||||
let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available());
|
||||
@@ -7376,7 +7182,7 @@ impl Service {
|
||||
}
|
||||
|
||||
if plan.len() >= fill_requirement
|
||||
|| free_tids_by_node.is_empty()
|
||||
|| tids_by_node.is_empty()
|
||||
|| attached <= expected_attached
|
||||
{
|
||||
break;
|
||||
@@ -7388,7 +7194,7 @@ impl Service {
|
||||
|
||||
let mut remove_node = false;
|
||||
while take > 0 {
|
||||
match free_tids_by_node.get_mut(&node_id) {
|
||||
match tids_by_node.get_mut(&node_id) {
|
||||
Some(tids) => match tids.pop() {
|
||||
Some(tid) => {
|
||||
let max_promote_for_tenant = std::cmp::max(
|
||||
@@ -7414,7 +7220,7 @@ impl Service {
|
||||
}
|
||||
|
||||
if remove_node {
|
||||
free_tids_by_node.remove(&node_id);
|
||||
tids_by_node.remove(&node_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7681,16 +7487,6 @@ impl Service {
|
||||
self.persistence.safekeeper_upsert(record).await
|
||||
}
|
||||
|
||||
pub(crate) async fn set_safekeeper_scheduling_policy(
|
||||
&self,
|
||||
id: i64,
|
||||
scheduling_policy: SkSchedulingPolicy,
|
||||
) -> Result<(), DatabaseError> {
|
||||
self.persistence
|
||||
.set_safekeeper_scheduling_policy(id, scheduling_policy)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn update_shards_preferred_azs(
|
||||
&self,
|
||||
req: ShardsPreferredAzsRequest,
|
||||
|
||||
@@ -43,6 +43,9 @@ impl<'a> Iterator for TenantShardContextIterator<'a> {
|
||||
|
||||
// Accumulate the schedule context for all the shards in a tenant
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
tenant_shards.push(shard);
|
||||
|
||||
if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
|
||||
@@ -112,7 +115,7 @@ mod tests {
|
||||
assert_eq!(tenant_id, t1_id);
|
||||
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
|
||||
assert_eq!(shards.len(), 1);
|
||||
assert_eq!(context.location_count(), 2);
|
||||
assert_eq!(context.attach_count(), 1);
|
||||
|
||||
let (tenant_id, context, shards) = iter.next().unwrap();
|
||||
assert_eq!(tenant_id, t2_id);
|
||||
@@ -121,13 +124,13 @@ mod tests {
|
||||
assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
|
||||
assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
|
||||
assert_eq!(shards.len(), 4);
|
||||
assert_eq!(context.location_count(), 8);
|
||||
assert_eq!(context.attach_count(), 4);
|
||||
|
||||
let (tenant_id, context, shards) = iter.next().unwrap();
|
||||
assert_eq!(tenant_id, t3_id);
|
||||
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
|
||||
assert_eq!(shards.len(), 1);
|
||||
assert_eq!(context.location_count(), 2);
|
||||
assert_eq!(context.attach_count(), 1);
|
||||
|
||||
for shard in tenants.values_mut() {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -370,7 +370,6 @@ class NeonEnvBuilder:
|
||||
pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None,
|
||||
num_safekeepers: int = 1,
|
||||
num_pageservers: int = 1,
|
||||
num_azs: int = 1,
|
||||
# Use non-standard SK ids to check for various parsing bugs
|
||||
safekeepers_id_start: int = 0,
|
||||
# fsync is disabled by default to make the tests go faster
|
||||
@@ -402,7 +401,6 @@ class NeonEnvBuilder:
|
||||
self.pageserver_config_override = pageserver_config_override
|
||||
self.num_safekeepers = num_safekeepers
|
||||
self.num_pageservers = num_pageservers
|
||||
self.num_azs = num_azs
|
||||
self.safekeepers_id_start = safekeepers_id_start
|
||||
self.safekeepers_enable_fsync = safekeepers_enable_fsync
|
||||
self.auth_enabled = auth_enabled
|
||||
@@ -992,7 +990,6 @@ class NeonEnv:
|
||||
self.endpoints = EndpointFactory(self)
|
||||
self.safekeepers: list[Safekeeper] = []
|
||||
self.pageservers: list[NeonPageserver] = []
|
||||
self.num_azs = config.num_azs
|
||||
self.broker = NeonBroker(self)
|
||||
self.pageserver_remote_storage = config.pageserver_remote_storage
|
||||
self.safekeepers_remote_storage = config.safekeepers_remote_storage
|
||||
@@ -1093,21 +1090,14 @@ class NeonEnv:
|
||||
http=self.port_distributor.get_port(),
|
||||
)
|
||||
|
||||
# Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override`
|
||||
if self.num_azs > 1:
|
||||
# Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc.
|
||||
az_prefix = DEFAULT_AZ_ID[:-1]
|
||||
availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}"
|
||||
else:
|
||||
availability_zone = DEFAULT_AZ_ID
|
||||
|
||||
ps_cfg: dict[str, Any] = {
|
||||
"id": ps_id,
|
||||
"listen_pg_addr": f"localhost:{pageserver_port.pg}",
|
||||
"listen_http_addr": f"localhost:{pageserver_port.http}",
|
||||
"pg_auth_type": pg_auth_type,
|
||||
"http_auth_type": http_auth_type,
|
||||
"availability_zone": availability_zone,
|
||||
# Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
|
||||
"availability_zone": DEFAULT_AZ_ID,
|
||||
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
|
||||
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
|
||||
"no_sync": True,
|
||||
@@ -1894,10 +1884,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_shard_dump(self):
|
||||
"""
|
||||
Debug listing API: dumps the internal map of tenant shards
|
||||
"""
|
||||
def tenant_list(self):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/debug/v1/tenant",
|
||||
@@ -1905,18 +1892,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_list(self, **kwargs):
|
||||
"""
|
||||
Control API tenant listing: a vector of the same content returned by tenant_describe
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/control/v1/tenant",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
params=kwargs,
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def node_configure(self, node_id, body: dict[str, Any]):
|
||||
log.info(f"node_configure({node_id}, {body})")
|
||||
body["node_id"] = node_id
|
||||
@@ -2263,7 +2238,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
"""
|
||||
Get the intent and observed placements of all tenants known to the storage controller.
|
||||
"""
|
||||
tenants = self.tenant_shard_dump()
|
||||
tenants = self.tenant_list()
|
||||
|
||||
tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict(
|
||||
lambda: {
|
||||
@@ -2346,14 +2321,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
json=body,
|
||||
)
|
||||
|
||||
def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
json={"id": id, "scheduling_policy": scheduling_policy},
|
||||
)
|
||||
|
||||
def get_safekeeper(self, id: int) -> dict[str, Any] | None:
|
||||
try:
|
||||
response = self.request(
|
||||
@@ -4153,7 +4120,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
|
||||
# Checkpoints running endpoint and returns pg_wal size in MB.
|
||||
def get_pg_wal_size(self):
|
||||
log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}")
|
||||
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
|
||||
self.safe_psql("checkpoint")
|
||||
assert self.pgdata_dir is not None # please mypy
|
||||
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
|
||||
@@ -4993,7 +4960,7 @@ def logical_replication_sync(
|
||||
if res:
|
||||
log.info(f"subscriber_lsn={res}")
|
||||
subscriber_lsn = Lsn(res)
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
|
||||
if subscriber_lsn >= publisher_lsn:
|
||||
return subscriber_lsn
|
||||
time.sleep(0.5)
|
||||
|
||||
@@ -15,6 +15,7 @@ from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures.common_types import (
|
||||
Id,
|
||||
Lsn,
|
||||
TenantId,
|
||||
TenantShardId,
|
||||
@@ -24,7 +25,7 @@ from fixtures.common_types import (
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import EnhancedJSONEncoder, Fn
|
||||
from fixtures.utils import Fn
|
||||
|
||||
|
||||
class PageserverApiException(Exception):
|
||||
@@ -82,6 +83,14 @@ class TimelineCreateRequest:
|
||||
mode: TimelineCreateRequestMode
|
||||
|
||||
def to_json(self) -> str:
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
|
||||
def default(self, o):
|
||||
if dataclasses.is_dataclass(o) and not isinstance(o, type):
|
||||
return dataclasses.asdict(o)
|
||||
elif isinstance(o, Id):
|
||||
return o.id.hex()
|
||||
return super().default(o)
|
||||
|
||||
# mode is flattened
|
||||
this = dataclasses.asdict(self)
|
||||
mode = this.pop("mode")
|
||||
|
||||
@@ -10,7 +10,7 @@ import requests
|
||||
from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.utils import EnhancedJSONEncoder, wait_until
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any
|
||||
@@ -25,7 +25,6 @@ class Walreceiver:
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
mconf: Configuration | None
|
||||
term: int
|
||||
last_log_term: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
@@ -70,56 +69,6 @@ class TermBumpResponse:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperId:
|
||||
id: int
|
||||
host: str
|
||||
pg_port: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class Configuration:
|
||||
generation: int
|
||||
members: list[SafekeeperId]
|
||||
new_members: list[SafekeeperId] | None
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> Configuration:
|
||||
generation = d["generation"]
|
||||
members = d["members"]
|
||||
new_members = d.get("new_members")
|
||||
return Configuration(generation, members, new_members)
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self, cls=EnhancedJSONEncoder)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TimelineCreateRequest:
|
||||
tenant_id: TenantId
|
||||
timeline_id: TimelineId
|
||||
mconf: Configuration
|
||||
# not exactly PgVersion, for example 150002 for 15.2
|
||||
pg_version: int
|
||||
start_lsn: Lsn
|
||||
commit_lsn: Lsn | None
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self, cls=EnhancedJSONEncoder)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TimelineMembershipSwitchResponse:
|
||||
previous_conf: Configuration
|
||||
current_conf: Configuration
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
|
||||
previous_conf = Configuration.from_json(d["previous_conf"])
|
||||
current_conf = Configuration.from_json(d["current_conf"])
|
||||
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
@@ -182,8 +131,20 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
resj = res.json()
|
||||
return [TenantTimelineId.from_json(ttidj) for ttidj in resj]
|
||||
|
||||
def timeline_create(self, r: TimelineCreateRequest):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json())
|
||||
def timeline_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
commit_lsn: Lsn,
|
||||
):
|
||||
body = {
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"pg_version": pg_version,
|
||||
"commit_lsn": str(commit_lsn),
|
||||
}
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_status(
|
||||
@@ -193,10 +154,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
# It is always normally not None, it is allowed only to make forward compat tests happy.
|
||||
mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
|
||||
return SafekeeperTimelineStatus(
|
||||
mconf=mconf,
|
||||
term=resj["acceptor_state"]["term"],
|
||||
last_log_term=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
@@ -222,11 +180,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
return self.timeline_status(tenant_id, timeline_id).commit_lsn
|
||||
|
||||
# Get timeline membership configuration.
|
||||
def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
|
||||
# make mypy happy
|
||||
return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore
|
||||
|
||||
# only_local doesn't remove segments in the remote storage.
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
|
||||
@@ -273,16 +226,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def membership_switch(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
|
||||
) -> TimelineMembershipSwitchResponse:
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
|
||||
data=to.to_json(),
|
||||
)
|
||||
res.raise_for_status()
|
||||
return TimelineMembershipSwitchResponse.from_json(res.json())
|
||||
|
||||
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -22,7 +21,6 @@ import zstandard
|
||||
from psycopg2.extensions import cursor
|
||||
from typing_extensions import override
|
||||
|
||||
from fixtures.common_types import Id, Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.common_types import (
|
||||
parse_delta_layer,
|
||||
@@ -607,22 +605,6 @@ class PropagatingThread(threading.Thread):
|
||||
return self.ret
|
||||
|
||||
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
|
||||
"""
|
||||
Default json.JSONEncoder works only on primitive builtins. Extend it to any
|
||||
dataclass plus our custom types.
|
||||
"""
|
||||
|
||||
def default(self, o):
|
||||
if dataclasses.is_dataclass(o) and not isinstance(o, type):
|
||||
return dataclasses.asdict(o)
|
||||
elif isinstance(o, Id):
|
||||
return o.id.hex()
|
||||
elif isinstance(o, Lsn):
|
||||
return str(o) # standard hex notation
|
||||
return super().default(o)
|
||||
|
||||
|
||||
def human_bytes(amt: float) -> str:
|
||||
"""
|
||||
Render a bytes amount into nice IEC bytes string.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user