Compare commits

..

17 Commits

Author SHA1 Message Date
Arseny Sher
c811ae0b91 convert appendresponse 2025-02-04 10:02:43 +01:00
Arseny Sher
777afbafe5 convert appendrequest 2025-01-30 13:21:40 +01:00
Arseny Sher
0d0cd16ea2 convert ProposerElected 2025-01-30 12:23:27 +01:00
Arseny Sher
217309c7ef remove propelected.timeline_start_lsn usages in sk 2025-01-30 11:29:14 +01:00
Arseny Sher
976afcee26 Convert VoteResponse, remove timeline_start_lsn. 2025-01-30 11:10:58 +01:00
Arseny Sher
20e974ecdd convert voterequest 2025-01-29 12:31:22 +01:00
Arseny Sher
eb43f65055 move greeting init v2 to PAMessageSerialize 2025-01-29 10:37:02 +01:00
Arseny Sher
96d67abd50 wp: print mconf 2025-01-28 13:42:53 +01:00
Arseny Sher
f7485c4459 convert acceptorgreeting 2025-01-23 10:26:13 +01:00
Arseny Sher
4ea7b22537 rename sk members to m for brevity 2025-01-22 13:52:30 +01:00
Arseny Sher
10a7878230 infra for v3 acceptor -> proposer msgs 2025-01-22 12:54:50 +01:00
Arseny Sher
c19a8b69f2 convert ProposerGreeting 2025-01-22 12:07:21 +01:00
Arseny Sher
8be17724d8 Add two safekeeper proto versions to test_normal_work. 2025-01-20 11:22:14 +01:00
Arseny Sher
234c3a29df Pass proto selection to safekeeper, prepare parsing v3. 2025-01-20 11:18:26 +01:00
Arseny Sher
db5513076a Add PAMessageSerialize and ProposerAcceptorGreeting v3 2025-01-15 16:29:20 +01:00
Arseny Sher
70d4e077a6 pgindent wp code 2025-01-15 16:29:20 +01:00
Arseny Sher
ae9db8975a Add START_WAL_PUSH proto_version and allow_timeline_creation options. 2025-01-15 16:29:20 +01:00
93 changed files with 2324 additions and 4816 deletions

View File

@@ -11,7 +11,7 @@ rustdocflags = ["-Arustdoc::private_intra_doc_links"]
# * <https://github.com/rust-lang/rust/pull/122646>
#
# NB: the RUSTFLAGS envvar will replace this. Make sure to update e.g. Dockerfile as well.
rustflags = ["-Cforce-frame-pointers=yes", "--cfg", "tokio_unstable", "--cfg", "tokio_taskdump"]
rustflags = ["-Cforce-frame-pointers=yes"]
[alias]
build_testing = ["build", "--features", "testing"]

View File

@@ -1,91 +0,0 @@
name: Check Codestyle Rust
on:
workflow_call:
inputs:
build-tools-image:
description: "build-tools image"
required: true
type: string
archs:
description: "Json array of architectures to run on"
type: string
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
check-codestyle-rust:
strategy:
matrix:
arch: ${{ fromJson(inputs.archs) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
container:
image: ${{ inputs.build-tools-image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
- name: Cache cargo deps
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
#
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
# time just for that, so skip "clippy --release".
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check --hide-inclusion-graph

View File

@@ -164,11 +164,77 @@ jobs:
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
archs: '["x64", "arm64"]'
secrets: inherit
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
- name: Cache cargo deps
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
#
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
# time just for that, so skip "clippy --release".
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check --hide-inclusion-graph
build-and-test-locally:
needs: [ tag, build-build-tools-image ]
@@ -824,7 +890,7 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml down
promote-images-dev:
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
needs: [ check-permissions, tag, vm-compute-node-image ]
runs-on: ubuntu-22.04
permissions:

View File

@@ -1,12 +1,6 @@
name: Pre-merge checks
on:
pull_request:
paths:
- .github/workflows/_check-codestyle-python.yml
- .github/workflows/_check-codestyle-rust.yml
- .github/workflows/build-build-tools-image.yml
- .github/workflows/pre-merge-checks.yml
merge_group:
branches:
- main
@@ -23,10 +17,8 @@ jobs:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
steps:
- uses: actions/checkout@v4
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: python-src
with:
@@ -38,25 +30,11 @@ jobs:
poetry.lock
pyproject.toml
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: rust-src
with:
files: |
.github/workflows/_check-codestyle-rust.yml
.github/workflows/build-build-tools-image.yml
.github/workflows/pre-merge-checks.yml
**/**.rs
**/Cargo.toml
Cargo.toml
Cargo.lock
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
env:
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
run: |
echo "${PYTHON_CHANGED_FILES}"
echo "${RUST_CHANGED_FILES}"
build-build-tools-image:
if: needs.get-changed-files.outputs.python-changed == 'true'
@@ -77,16 +55,6 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
secrets: inherit
check-codestyle-rust:
if: needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
archs: '["x64"]'
secrets: inherit
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
# Currently we require 2 jobs (checks with exact name):
# - conclusion
@@ -99,7 +67,6 @@ jobs:
needs:
- get-changed-files
- check-codestyle-python
- check-codestyle-rust
runs-on: ubuntu-22.04
steps:
- name: Create fake `neon-cloud-e2e` check

7
Cargo.lock generated
View File

@@ -1312,7 +1312,6 @@ dependencies = [
"tracing-utils",
"url",
"utils",
"uuid",
"vm_monitor",
"workspace_hack",
"zstd",
@@ -3982,11 +3981,9 @@ name = "pagectl"
version = "0.1.0"
dependencies = [
"anyhow",
"bincode",
"camino",
"clap",
"humantime",
"itertools 0.10.5",
"pageserver",
"pageserver_api",
"postgres_ffi",
@@ -4008,7 +4005,6 @@ dependencies = [
"arc-swap",
"async-compression",
"async-stream",
"bincode",
"bit_field",
"byteorder",
"bytes",
@@ -5659,7 +5655,6 @@ dependencies = [
"crc32c",
"criterion",
"desim",
"env_logger 0.10.2",
"fail",
"futures",
"hex",
@@ -5688,7 +5683,6 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"smallvec",
"storage_broker",
"strum",
"strum_macros",
@@ -5715,7 +5709,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"const_format",
"pageserver_api",
"postgres_ffi",
"pq_proto",
"serde",

View File

@@ -66,7 +66,6 @@ RUN cd postgres && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
# Enable some of contrib extensions
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
@@ -872,7 +871,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --locked --version 0.12.9 cargo-pgrx && \
cargo install --locked --version 0.12.6 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
@@ -909,19 +908,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
\
cd exts/rag && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
\
cd ../rag_bge_small_en_v15 && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
cargo pgrx install --release --features remote_onnx && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
\
cd ../rag_jina_reranker_v1_tiny_en && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
cargo pgrx install --release --features remote_onnx && \
@@ -946,8 +945,7 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
# against postgres forks that decided to change their ABI name (like us).
# With that we can build extensions without forking them and using stock
# pgx. As this feature is new few manual version bumps were required.
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -965,8 +963,7 @@ ARG PG_VERSION
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
@@ -987,58 +984,33 @@ ARG PG_VERSION
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
# TODO update pgrx version in the pg_tiktoken repo and remove this line
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
#
# Layer "pg-pgx-ulid-build"
# Compile "pgx_ulid" extension for v16 and below
# Compile "pgx_ulid" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16") \
;; \
*) \
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
;; \
# doesn't support v17 yet
# https://github.com/pksunkara/pgx_ulid/pull/52
RUN case "${PG_VERSION}" in "v17") \
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
# Layer "pg-pgx-ulid-pgrx12-build"
# Compile "pgx_ulid" extension for v17 and up
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in \
"v17") \
;; \
*) \
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \
echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
@@ -1056,11 +1028,7 @@ ARG PG_VERSION
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release
#########################################################################################
@@ -1183,7 +1151,6 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/

View File

@@ -51,7 +51,6 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
uuid.workspace = true
prometheus.workspace = true
postgres_initdb.workspace = true

View File

@@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{error, info, info_span, warn, Instrument};
use tracing::{info, info_span, warn, Instrument};
use utils::fs_ext::is_directory_empty;
#[path = "fast_import/aws_s3_sync.rs"]
@@ -41,19 +41,12 @@ mod child_stdio_to_log;
#[path = "fast_import/s3_uri.rs"]
mod s3_uri;
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
#[derive(clap::Parser)]
struct Args {
#[clap(long)]
working_directory: Utf8PathBuf,
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
s3_prefix: Option<s3_uri::S3Uri>,
#[clap(long)]
source_connection_string: Option<String>,
#[clap(short, long)]
interactive: bool,
s3_prefix: s3_uri::S3Uri,
#[clap(long)]
pg_bin_dir: Utf8PathBuf,
#[clap(long)]
@@ -84,70 +77,30 @@ pub(crate) async fn main() -> anyhow::Result<()> {
info!("starting");
let args = Args::parse();
let Args {
working_directory,
s3_prefix,
pg_bin_dir,
pg_lib_dir,
} = Args::parse();
// Validate arguments
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
}
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
}
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let working_directory = args.working_directory;
let pg_bin_dir = args.pg_bin_dir;
let pg_lib_dir = args.pg_lib_dir;
// Initialize AWS clients only if s3_prefix is specified
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms = aws_sdk_kms::Client::new(&config);
(Some(config), Some(kms))
} else {
(None, None)
};
// Get source connection string either from S3 spec or direct argument
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
};
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.unwrap()
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
} else {
args.source_connection_string.unwrap()
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(&aws_config);
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
};
match tokio::fs::create_dir(&working_directory).await {
@@ -170,6 +123,15 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.await
.context("create pgdata directory")?;
//
// Setup clients
//
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms_client = aws_sdk_kms::Client::new(&aws_config);
//
// Initialize pgdata
//
let pgbin = pg_bin_dir.join("postgres");
let pg_version = match get_pg_version(pgbin.as_ref()) {
PostgresMajorVersion::V14 => 14,
@@ -208,13 +170,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args([
"-c",
&format!(
"effective_io_concurrency={}",
if cfg!(target_os = "macos") { 0 } else { 100 }
),
])
.args(["-c", "effective_io_concurrency=100"])
.env_clear()
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
@@ -229,58 +185,44 @@ pub(crate) async fn main() -> anyhow::Result<()> {
)
.instrument(info_span!("postgres")),
);
// Create neondb database in the running postgres
let restore_pg_connstring =
format!("host=localhost port=5432 user={superuser} dbname=postgres");
let start_time = std::time::Instant::now();
loop {
if start_time.elapsed() > PG_WAIT_TIMEOUT {
error!(
"timeout exceeded: failed to poll postgres and create database within 10 minutes"
);
std::process::exit(1);
}
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
Ok((client, connection)) => {
// Spawn the connection handling task to maintain the connection
tokio::spawn(async move {
if let Err(e) = connection.await {
warn!("connection error: {}", e);
}
});
match client.simple_query("CREATE DATABASE neondb;").await {
Ok(_) => {
info!("created neondb database");
break;
}
Err(e) => {
warn!(
"failed to create database: {}, retying in {}s",
e,
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
);
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
continue;
}
}
}
Err(_) => {
info!(
"postgres not ready yet, retrying in {}s",
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
);
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
continue;
}
let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
if res.is_ok() {
info!("postgres is ready, could connect to it");
break;
}
}
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
//
// Decrypt connection string
//
let source_connection_string = {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
};
//
// Start the work
//
let dumpdir = working_directory.join("dumpdir");
@@ -368,12 +310,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
}
}
// If interactive mode, wait for Ctrl+C
if args.interactive {
info!("Running in interactive mode. Press Ctrl+C to shut down.");
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
}
info!("shutdown postgres");
{
nix::sys::signal::kill(
@@ -389,24 +325,21 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.context("wait for postgres to shut down")?;
}
// Only sync if s3_prefix was specified
if let Some(s3_prefix) = args.s3_prefix {
info!("upload pgdata");
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
.await
.context("sync dump directory to destination")?;
info!("upload pgdata");
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
.await
.context("sync dump directory to destination")?;
info!("write status");
{
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
.await
.context("sync status directory to destination")?;
}
info!("write status");
{
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
.await
.context("sync status directory to destination")?;
}
Ok(())

View File

@@ -17,8 +17,7 @@ use crate::{
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct ExtensionServerParams {
#[serde(default)]
is_library: bool,
is_library: Option<bool>,
}
/// Download a remote extension.
@@ -52,7 +51,7 @@ pub(in crate::http) async fn download_extension(
remote_extensions.get_ext(
&filename,
params.is_library,
params.is_library.unwrap_or(false),
&compute.build_tag,
&compute.pgversion,
)

View File

@@ -1,14 +1,15 @@
use std::{
net::{IpAddr, Ipv6Addr, SocketAddr},
sync::Arc,
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
thread,
time::Duration,
};
use anyhow::Result;
use axum::{
extract::Request,
middleware::{self, Next},
response::{IntoResponse, Response},
routing::{get, post},
Router,
@@ -16,9 +17,11 @@ use axum::{
use http::StatusCode;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
use tower_http::{
request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
trace::TraceLayer,
};
use tracing::{debug, error, info, Span};
use uuid::Uuid;
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
@@ -31,24 +34,30 @@ async fn handle_404() -> Response {
StatusCode::NOT_FOUND.into_response()
}
const X_REQUEST_ID: &str = "x-request-id";
#[derive(Clone, Default)]
struct ComputeMakeRequestId(Arc<AtomicU64>);
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
impl MakeRequestId for ComputeMakeRequestId {
fn make_request_id<B>(
&mut self,
_request: &http::Request<B>,
) -> Option<tower_http::request_id::RequestId> {
let request_id = self
.0
.fetch_add(1, Ordering::SeqCst)
.to_string()
.parse()
.unwrap();
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
Some(RequestId::new(request_id))
}
next.run(request).await
}
/// Run the HTTP server and wait on it forever.
#[tokio::main]
async fn serve(port: u16, compute: Arc<ComputeNode>) {
const X_REQUEST_ID: &str = "x-request-id";
let mut app = Router::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
@@ -73,8 +82,9 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
.fallback(handle_404)
.layer(
ServiceBuilder::new()
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(SetRequestIdLayer::x_request_id(
ComputeMakeRequestId::default(),
))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {

View File

@@ -9,9 +9,8 @@ use clap::{Parser, Subcommand};
use pageserver_api::{
controller_api::{
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
TenantPolicyRequest,
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -232,13 +231,6 @@ enum Command {
},
/// List safekeepers known to the storage controller
Safekeepers {},
/// Set the scheduling policy of the specified safekeeper
SafekeeperScheduling {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
scheduling_policy: SkSchedulingPolicyArg,
},
}
#[derive(Parser)]
@@ -291,17 +283,6 @@ impl FromStr for PlacementPolicyArg {
}
}
#[derive(Debug, Clone)]
struct SkSchedulingPolicyArg(SkSchedulingPolicy);
impl FromStr for SkSchedulingPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
SkSchedulingPolicy::from_str(s).map(Self)
}
}
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
@@ -1221,23 +1202,6 @@ async fn main() -> anyhow::Result<()> {
}
println!("{table}");
}
Command::SafekeeperScheduling {
node_id,
scheduling_policy,
} => {
let scheduling_policy = scheduling_policy.0;
storcon_client
.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
Method::POST,
format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
)
.await?;
println!(
"Scheduling policy of {node_id} set to {}",
String::from(scheduling_policy)
);
}
}
Ok(())

View File

@@ -1,255 +0,0 @@
#
Created on Aug 2024
Implemented on Jan 2025
## Summary
Data in large tenants is split up between multiple pageservers according to key hashes, as
introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md).
Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs,
in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives
only the data it needs.
This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth
for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards).
## Motivation
1. Large databases require higher shard counts. Whereas currently we run with up to 8 shards for tenants
with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such
that sending all WAL to all shards is impractical in terms of bandwidth.
2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each
shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver
only has to process relevant parts.
## Non Goals (if relevant)
We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's
WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental
move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the
pageserver, they will bottleneck on the safekeeper.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Safekeeper, pageserver.
There will be no control plane or storage controller coordination needed, as pageservers will directly
indicate their sharding parameters to the safekeeper when subscribing for WAL.
## Proposed implementation
Terminology:
- "Data pages" refers to postgres relation blocks, and SLRU blocks.
- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and
directories of relations.
### Phase 1: Refactor ingest
Currently, pageserver ingest code is structured approximately as follows:
1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network
socket
2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications
3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when
its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`.
This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and
from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages
such as relation sizes and the master DBDIR page. It also assumes that records are ingested
strictly one after the other: they cannot be ingested in parallel because each record assumes
that earlier records' changes have already been applied to `Timeline`.
This code will be refactored to disentangle the simple, fast decode of relation page writes
from the more complex logic for updating internal metadata. An intermediate representation
called `InterpretedWalRecords` will be introduced. This is similar to the internal state of
a `DatadirModification`, but does not require access to a Timeline. Instead of storing
metadata updates as materialized writes to pages, it will accumulate these as abstract operations,
for example rather than including a write to a relation size key, this structure will include
an operation that indicates "Update relation _foo_'s size to the max of its current value and
_bar_", such that these may be applied later to a real Timeline.
The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates
simple page writes of relation blocks, it can write them directly into a buffer in the serialized
format. This will avoid the need to later deserialize/reserialize this data when passing the
structure between safekeeper and pageserver.
The new pipeline will be:
1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network
2. A `InterpretedWalRecords` is generated from the incoming WAL records. This does not
require a reference to a Timeline.
3. The logic that is current spread between `WalIngest` and `DatadirModification` for updating
metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords`
and turn them into literal writes to metadata pages. This part must be done sequentially.
4. The resulting buffer of metadata page writes is combined with the buffer of relation block
writes, and written into the `InMemoryLayer`.
Implemented in:
1. https://github.com/neondatabase/neon/pull/9472
2. https://github.com/neondatabase/neon/pull/9504
3. https://github.com/neondatabase/neon/pull/9524
### Phase 2: Decode & filter on safekeeper
In the previous phase, the ingest code was modified to be able to do most of its work without access to
a Timeline: this first stage of ingest simply converts a series of binary wal records into
a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes.
The modified ingest code may be transplanted from pageserver to safekeeper (probably via a
shared crate). The safekeeper->pageserver network protocol is modified to:
- in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper
- in responses, transmit a `InterpretedWalRecords` instead of a raw `WalRecord`.
- use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for
the subscribing shard before transmitting it.
The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of
consistent LSN feedback, and connection management. Only the payload of the subscriptions
changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the
raw data.
The ingest code on the pageserver can now skip the part where it does the first phase of
processing, as it will receive pre-processed, compressed data off the wire.
Note that `InterpretedWalRecord` batches multiple `InterpretedWalRecord(s)` in the same network
message. Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records
as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards.
Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks.
The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future,
we may migrate it to GRPC altogether
Implemented in:
1. https://github.com/neondatabase/neon/pull/9746
2. https://github.com/neondatabase/neon/pull/9821
### Phase 3: Fan out interpreted WAL
In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still
done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially
when considering converting to Protobuf format and compression).
To avoid this, we fan-out WAL from one (tenant, timeline, shard) to all other shards subscribed on
the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted
_only_ once per (safekeeper, timeline).
When the first shard of a sharded timeline subscribes to a given safekeeper a task is spawned
for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends
it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection
task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader.
There's two cases to consider:
1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard
will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain
by letting shards "front-run" since compute backpressure is based on the laggard LSN.
2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader
gets reset to this requested position (same intuition). Special care is taken such that advanced shards do not receive
interpreted WAL records below their current position.
The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time.
If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper
argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute
delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers,
so it's recommended to use large values to avoid pushing CPU utilisation too high.
Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll
the reader and sender concurrently from the connection task.
Shard splits are interesting here because it is the only case when the same shard might have two subscriptions at the same time.
This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start
position.
Implemented in:
1. https://github.com/neondatabase/neon/pull/10190
## Deployment
Each phase shall be deployed independently. Special care should be taken around protocol changes.
## Observability Tips
* The safekeeper logs the protocol requested by the pageserver
along with the pageserver ID, tenant, timeline and shard: `starting streaming from`.
* There's metrics for the number of wal readers:
* `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of wal reader tasks for each SK
* `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the numer of wal readers polled inline by each SK
* `safekeeper_interpreted_wal_reader_tasks` gives the number of wal reader tasks per tenant, timeline
* Interesting log lines for the fan-out reader:
* `Spawning interpreted`: first shard creates the interpreted wal reader
* `Fanning out`: a subsequent shard attaches itself to an interpreted wal reader
* `Aborting interpreted`: all senders have finished and the reader task is being aborted
## Future Optimizations
This sections describes some improvement areas which may be revisited in the future.
### Buffering of Interpreted WAL
The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving
subscriptions that are lagging behind the current position of the reader.
Counterpoints:
* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful,
especially given that it would go unused on the happy path.
* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting.
### Tweaking the Pagserver Safekeeper Selection Algorithm
We could make the pageserver aware of which safekeeper's already host shards for the timeline along
with their current WAL positions. The pageserver should then prefer safkeepers that are in the same
AZ _and_ already have a shard with a position close to the desired start position.
We currently run one safekeeper per AZ, so the point is mute until that changes.
### Pipelining first ingest phase
The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed
output per shard. To put multiple CPUs to work, we may pipeline this processing up to some defined buffer
depth.
## Alternatives considered
### Give safekeepers enough state to fully decode WAL
In this RFC, we only do the first phase of ingest on the safekeeper, because this is
the phase that is stateless. Subsequent changes then happen on the pageserver, with
access to the `Timeline` state.
We could do more work on the safekeeper if we transmitted metadata state to the safekeeper
when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes,
so that it could then generate all the metadata writes for relation sizes.
We avoid doing this for several reasons:
1. Complexity: it's a more invasive protocol change
2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat
infects it with knowledge of the pageserver, but this is mainly an abstract structure
that describes postgres writes. However, if we taught the safekeeper about the exact
way that pageserver deals with metadata keys, this would be a much tighter coupling.
3. Load: once the WAL has been processed to the point that it can be split between shards,
it is preferable to share out work on the remaining shards rather than adding extra CPU
load to the safekeeper.
### Do pre-processing on the compute instead of the safekeeper
Since our first stage of ingest is stateless, it could be done at any stage in the pipeline,
all the way up to the compute.
We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather
than just the preprocessed WAL:
- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication
- It simplifies our paxos implementation to have the offset in the write log be literally
the same as the LSN
- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future.
Storing raw WAL give us more flexibility to evolve the pageserver, safekeeper protocol.
### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there
If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then
we could do the initial decode and shard-splitting in some other location:
- Shard 0 could subscribe to the full WAL and then send writes to other shards
- A new intermediate service between the safekeeper and pageserver could do the splitting.
So why not?
- Extra network hop from shard 0 to the final destination shard
- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper.
- Safekeepers already have very light CPU load: typical cloud instances shapes with appropriate
disks for the safekeepers effectively have "free" CPU resources.
- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because
shard 0 would have significantly higher CPU load under write workloads than other shards.

View File

@@ -324,7 +324,7 @@ impl From<NodeSchedulingPolicy> for String {
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Pause,
Disabled,
Decomissioned,
}
@@ -334,13 +334,9 @@ impl FromStr for SkSchedulingPolicy {
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"pause" => Self::Pause,
"disabled" => Self::Disabled,
"decomissioned" => Self::Decomissioned,
_ => {
return Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,pause,decomissioned"
))
}
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
})
}
}
@@ -350,7 +346,7 @@ impl From<SkSchedulingPolicy> for String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Pause => "pause",
Disabled => "disabled",
Decomissioned => "decomissioned",
}
.to_string()
@@ -420,6 +416,8 @@ pub struct MetadataHealthListOutdatedResponse {
}
/// Publicly exposed safekeeper description
///
/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperDescribeResponse {
pub id: NodeId,
@@ -435,11 +433,6 @@ pub struct SafekeeperDescribeResponse {
pub scheduling_policy: SkSchedulingPolicy,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
#[cfg(test)]
mod test {
use super::*;

View File

@@ -24,9 +24,7 @@ pub struct Key {
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(
Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
)]
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
pub struct CompactKey(i128);
/// The storage key size.

View File

@@ -29,10 +29,11 @@ use utils::{
};
use crate::{
key::{CompactKey, Key},
key::Key,
reltag::RelTag,
shard::{ShardCount, ShardStripeSize, TenantShardId},
};
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
/// The state of a tenant in this pageserver.
@@ -1399,8 +1400,6 @@ pub enum PagestreamFeMessage {
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
#[cfg(feature = "testing")]
Test(PagestreamTestRequest),
}
// Wrapped in libpq CopyData
@@ -1412,22 +1411,6 @@ pub enum PagestreamBeMessage {
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
#[cfg(feature = "testing")]
Test(PagestreamTestResponse),
}
// Keep in sync with `pagestore_client.h`
#[repr(u8)]
enum PagestreamFeMessageTag {
Exists = 0,
Nblocks = 1,
GetPage = 2,
DbSize = 3,
GetSlruSegment = 4,
/* future tags above this line */
/// For testing purposes, not available in production.
#[cfg(feature = "testing")]
Test = 99,
}
// Keep in sync with `pagestore_client.h`
@@ -1439,28 +1422,7 @@ enum PagestreamBeMessageTag {
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
/* future tags above this line */
/// For testing purposes, not available in production.
#[cfg(feature = "testing")]
Test = 199,
}
impl TryFrom<u8> for PagestreamFeMessageTag {
type Error = u8;
fn try_from(value: u8) -> Result<Self, u8> {
match value {
0 => Ok(PagestreamFeMessageTag::Exists),
1 => Ok(PagestreamFeMessageTag::Nblocks),
2 => Ok(PagestreamFeMessageTag::GetPage),
3 => Ok(PagestreamFeMessageTag::DbSize),
4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
#[cfg(feature = "testing")]
99 => Ok(PagestreamFeMessageTag::Test),
_ => Err(value),
}
}
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
fn try_from(value: u8) -> Result<Self, u8> {
@@ -1471,8 +1433,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
#[cfg(feature = "testing")]
199 => Ok(PagestreamBeMessageTag::Test),
_ => Err(value),
}
}
@@ -1590,20 +1550,6 @@ pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
#[cfg(feature = "testing")]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PagestreamTestRequest {
pub hdr: PagestreamRequest,
pub batch_key: u64,
pub message: String,
}
#[cfg(feature = "testing")]
#[derive(Debug)]
pub struct PagestreamTestResponse {
pub req: PagestreamTestRequest,
}
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
@@ -1623,7 +1569,7 @@ impl PagestreamFeMessage {
match self {
Self::Exists(req) => {
bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
bytes.put_u8(0);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1634,7 +1580,7 @@ impl PagestreamFeMessage {
}
Self::Nblocks(req) => {
bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
bytes.put_u8(1);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1645,7 +1591,7 @@ impl PagestreamFeMessage {
}
Self::GetPage(req) => {
bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
bytes.put_u8(2);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1657,7 +1603,7 @@ impl PagestreamFeMessage {
}
Self::DbSize(req) => {
bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
bytes.put_u8(3);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1665,24 +1611,13 @@ impl PagestreamFeMessage {
}
Self::GetSlruSegment(req) => {
bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
bytes.put_u8(4);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
#[cfg(feature = "testing")]
Self::Test(req) => {
bytes.put_u8(PagestreamFeMessageTag::Test as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.batch_key);
let message = req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
bytes.into()
@@ -1710,66 +1645,56 @@ impl PagestreamFeMessage {
),
};
match PagestreamFeMessageTag::try_from(msg_tag)
.map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
{
PagestreamFeMessageTag::Exists => {
Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
}))
}
PagestreamFeMessageTag::Nblocks => {
Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
}))
}
PagestreamFeMessageTag::GetPage => {
Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.read_u32::<BigEndian>()?,
}))
}
PagestreamFeMessageTag::DbSize => {
Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
}))
}
PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
hdr: PagestreamRequest {
reqid,
@@ -1780,21 +1705,7 @@ impl PagestreamFeMessage {
segno: body.read_u32::<BigEndian>()?,
},
)),
#[cfg(feature = "testing")]
PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
batch_key: body.read_u64::<BigEndian>()?,
message: {
let len = body.read_u64::<BigEndian>()?;
let mut buf = vec![0; len as usize];
body.read_exact(&mut buf)?;
String::from_utf8(buf)?
},
})),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
}
@@ -1837,15 +1748,6 @@ impl PagestreamBeMessage {
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
#[cfg(feature = "testing")]
Self::Test(resp) => {
bytes.put_u8(Tag::Test as u8);
bytes.put_u64(resp.req.batch_key);
let message = resp.req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
}
PagestreamProtocolVersion::V3 => {
@@ -1914,18 +1816,6 @@ impl PagestreamBeMessage {
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
#[cfg(feature = "testing")]
Self::Test(resp) => {
bytes.put_u8(Tag::Test as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u64(resp.req.batch_key);
let message = resp.req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
}
}
@@ -2068,28 +1958,6 @@ impl PagestreamBeMessage {
segment: segment.into(),
})
}
#[cfg(feature = "testing")]
Tag::Test => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let batch_key = buf.read_u64::<BigEndian>()?;
let len = buf.read_u64::<BigEndian>()?;
let mut msg = vec![0; len as usize];
buf.read_exact(&mut msg)?;
let message = String::from_utf8(msg)?;
Self::Test(PagestreamTestResponse {
req: PagestreamTestRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
batch_key,
message,
},
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -2109,25 +1977,6 @@ impl PagestreamBeMessage {
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
#[cfg(feature = "testing")]
Self::Test(_) => "Test",
}
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct PageTraceEvent {
pub key: CompactKey,
pub effective_lsn: Lsn,
pub time: SystemTime,
}
impl Default for PageTraceEvent {
fn default() -> Self {
Self {
key: Default::default(),
effective_lsn: Default::default(),
time: std::time::UNIX_EPOCH,
}
}
}

View File

@@ -13,4 +13,3 @@ postgres_ffi.workspace = true
pq_proto.workspace = true
tokio.workspace = true
utils.workspace = true
pageserver_api.workspace = true

View File

@@ -38,14 +38,12 @@ impl Display for SafekeeperId {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(transparent)]
pub struct MemberSet {
pub members: Vec<SafekeeperId>,
pub m: Vec<SafekeeperId>,
}
impl MemberSet {
pub fn empty() -> Self {
MemberSet {
members: Vec::new(),
}
MemberSet { m: Vec::new() }
}
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
@@ -53,11 +51,11 @@ impl MemberSet {
if hs.len() != members.len() {
bail!("duplicate safekeeper id in the set {:?}", members);
}
Ok(MemberSet { members })
Ok(MemberSet { m: members })
}
pub fn contains(&self, sk: &SafekeeperId) -> bool {
self.members.iter().any(|m| m.id == sk.id)
self.m.iter().any(|m| m.id == sk.id)
}
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
@@ -67,7 +65,7 @@ impl MemberSet {
sk.id, self
));
}
self.members.push(sk);
self.m.push(sk);
Ok(())
}
}
@@ -75,11 +73,7 @@ impl MemberSet {
impl Display for MemberSet {
/// Display as a comma separated list of members.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let sks_str = self
.members
.iter()
.map(|m| m.to_string())
.collect::<Vec<_>>();
let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
write!(f, "({})", sks_str.join(", "))
}
}

View File

@@ -1,6 +1,5 @@
//! Types used in safekeeper http API. Many of them are also reused internally.
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::TimestampTz;
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
@@ -147,13 +146,7 @@ pub type ConnectionId = u32;
/// Serialize is used only for json'ing in API response. Also used internally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WalSenderState {
Vanilla(VanillaWalSenderState),
Interpreted(InterpretedWalSenderState),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VanillaWalSenderState {
pub struct WalSenderState {
pub ttid: TenantTimelineId,
pub addr: SocketAddr,
pub conn_id: ConnectionId,
@@ -162,17 +155,6 @@ pub struct VanillaWalSenderState {
pub feedback: ReplicationFeedback,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterpretedWalSenderState {
pub ttid: TenantTimelineId,
pub shard: ShardIdentity,
pub addr: SocketAddr,
pub conn_id: ConnectionId,
// postgres application_name
pub appname: Option<String>,
pub feedback: ReplicationFeedback,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
/// None means it is recovery initiated by us (this safekeeper).
@@ -277,8 +259,3 @@ pub struct TimelineTermBumpResponse {
pub previous_term: u64,
pub current_term: u64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SafekeeperUtilization {
pub timeline_count: u64,
}

View File

@@ -1,54 +0,0 @@
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
//! don't block reads.
use arc_swap::ArcSwap;
use std::sync::Arc;
use tokio::sync::TryLockError;
pub struct GuardArcSwap<T> {
inner: ArcSwap<T>,
guard: tokio::sync::Mutex<()>,
}
pub struct Guard<'a, T> {
_guard: tokio::sync::MutexGuard<'a, ()>,
inner: &'a ArcSwap<T>,
}
impl<T> GuardArcSwap<T> {
pub fn new(inner: T) -> Self {
Self {
inner: ArcSwap::new(Arc::new(inner)),
guard: tokio::sync::Mutex::new(()),
}
}
pub fn read(&self) -> Arc<T> {
self.inner.load_full()
}
pub async fn write_guard(&self) -> Guard<'_, T> {
Guard {
_guard: self.guard.lock().await,
inner: &self.inner,
}
}
pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
let guard = self.guard.try_lock()?;
Ok(Guard {
_guard: guard,
inner: &self.inner,
})
}
}
impl<T> Guard<'_, T> {
pub fn read(&self) -> Arc<T> {
self.inner.load_full()
}
pub fn write(&mut self, value: T) {
self.inner.store(Arc::new(value));
}
}

View File

@@ -98,8 +98,6 @@ pub mod try_rcu;
pub mod pprof;
pub mod guard_arc_swap;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;

View File

@@ -64,7 +64,7 @@ pub struct InterpretedWalRecords {
}
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct InterpretedWalRecord {
/// Optional metadata record - may cause writes to metadata keys
/// in the storage engine

View File

@@ -32,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub enum ValueMeta {
Serialized(SerializedValueMeta),
Observed(ObservedValueMeta),
@@ -79,7 +79,7 @@ impl PartialEq for OrderedValueMeta {
impl Eq for OrderedValueMeta {}
/// Metadata for a [`Value`] serialized into the batch.
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct SerializedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
@@ -91,14 +91,14 @@ pub struct SerializedValueMeta {
}
/// Metadata for a [`Value`] observed by the batch
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct ObservedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
}
/// Batch of serialized [`Value`]s.
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct SerializedValueBatch {
/// [`Value`]s serialized in EphemeralFile's native format,
/// ready for disk write by the pageserver

View File

@@ -215,6 +215,7 @@ impl Wrapper {
syncSafekeepers: config.sync_safekeepers,
systemId: 0,
pgTimeline: 1,
proto_version: 2,
callback_data,
};
let c_config = Box::into_raw(Box::new(c_config));

View File

@@ -8,7 +8,7 @@ license.workspace = true
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
[dependencies]
anyhow.workspace = true
@@ -16,7 +16,6 @@ arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
bit_field.workspace = true
bincode.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
@@ -114,7 +113,3 @@ harness = false
[[bench]]
name = "upload_queue"
harness = false
[[bin]]
name = "test_helper_slow_client_reads"
required-features = [ "testing" ]

View File

@@ -4,9 +4,6 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
testing = [ "pageserver_api/testing" ]
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true

View File

@@ -1,9 +1,6 @@
use std::sync::{Arc, Mutex};
use std::pin::Pin;
use futures::{
stream::{SplitSink, SplitStream},
SinkExt, StreamExt,
};
use futures::SinkExt;
use pageserver_api::{
models::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
@@ -13,6 +10,7 @@ use pageserver_api::{
};
use tokio::task::JoinHandle;
use tokio_postgres::CopyOutStream;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
use utils::{
id::{TenantId, TimelineId},
@@ -64,28 +62,15 @@ impl Client {
.client
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
.await?;
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
let Client {
cancel_on_client_drop,
conn_task,
client: _,
} = self;
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
ConnTaskRunning {
cancel_on_client_drop,
conn_task,
},
)));
Ok(PagestreamClient {
sink: PagestreamSender {
shared: shared.clone(),
sink,
},
stream: PagestreamReceiver {
shared: shared.clone(),
stream,
},
shared,
copy_both: Box::pin(copy_both),
conn_task,
cancel_on_client_drop,
})
}
@@ -112,28 +97,7 @@ impl Client {
/// Create using [`Client::pagestream`].
pub struct PagestreamClient {
shared: Arc<Mutex<PagestreamShared>>,
sink: PagestreamSender,
stream: PagestreamReceiver,
}
pub struct PagestreamSender {
#[allow(dead_code)]
shared: Arc<Mutex<PagestreamShared>>,
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
}
pub struct PagestreamReceiver {
#[allow(dead_code)]
shared: Arc<Mutex<PagestreamShared>>,
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
}
enum PagestreamShared {
ConnTaskRunning(ConnTaskRunning),
ConnTaskCancelledJoinHandleReturnedOrDropped,
}
struct ConnTaskRunning {
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
conn_task: JoinHandle<()>,
}
@@ -146,11 +110,11 @@ pub struct RelTagBlockNo {
impl PagestreamClient {
pub async fn shutdown(self) {
let Self {
shared,
sink,
stream,
} = { self };
// The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
copy_both,
cancel_on_client_drop: cancel_conn_task,
conn_task,
} = self;
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
//
@@ -167,77 +131,27 @@ impl PagestreamClient {
//
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
// => https://github.com/neondatabase/neon/issues/6390
let ConnTaskRunning {
cancel_on_client_drop,
conn_task,
} = {
let mut guard = shared.lock().unwrap();
match std::mem::replace(
&mut *guard,
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
) {
PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
}
};
let _ = cancel_on_client_drop.unwrap();
let _ = cancel_conn_task.unwrap();
conn_task.await.unwrap();
// Now drop the split copy_both.
drop(sink);
drop(stream);
}
pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
let Self {
shared: _,
sink,
stream,
} = self;
(sink, stream)
drop(copy_both);
}
pub async fn getpage(
&mut self,
req: PagestreamGetPageRequest,
) -> anyhow::Result<PagestreamGetPageResponse> {
self.getpage_send(req).await?;
self.getpage_recv().await
}
let req = PagestreamFeMessage::GetPage(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);
let mut req = tokio_stream::once(Ok(req));
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
self.sink.getpage_send(req).await
}
self.copy_both.send_all(&mut req).await?;
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
self.stream.getpage_recv().await
}
}
impl PagestreamSender {
// TODO: maybe make this impl Sink instead for better composability?
pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
let msg = msg.serialize();
self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
Ok(())
}
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
self.send(PagestreamFeMessage::GetPage(req)).await
}
}
impl PagestreamReceiver {
// TODO: maybe make this impl Stream instead for better composability?
pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
let next: bytes::Bytes = next.unwrap()?;
PagestreamBeMessage::deserialize(next)
}
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
let next: PagestreamBeMessage = self.recv().await?;
match next {
let msg = PagestreamBeMessage::deserialize(next)?;
match msg {
PagestreamBeMessage::GetPage(p) => Ok(p),
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
PagestreamBeMessage::Exists(_)
@@ -246,14 +160,7 @@ impl PagestreamReceiver {
| PagestreamBeMessage::GetSlruSegment(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
next.kind()
)
}
#[cfg(feature = "testing")]
PagestreamBeMessage::Test(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
next.kind()
msg.kind()
)
}
}

View File

@@ -8,11 +8,9 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
bincode.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
humantime.workspace = true
itertools.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }

View File

@@ -9,9 +9,7 @@ mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
mod page_trace;
use page_trace::PageTraceCmd;
use std::{
str::FromStr,
time::{Duration, SystemTime},
@@ -66,7 +64,6 @@ enum Commands {
Layer(LayerCmd),
/// Debug print a hex key found from logs
Key(key::DescribeKeyCommand),
PageTrace(PageTraceCmd),
}
/// Read and update pageserver metadata file
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
Commands::Key(dkc) => dkc.execute(),
Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
};
Ok(())
}

View File

@@ -1,73 +0,0 @@
use std::collections::HashMap;
use std::io::BufReader;
use camino::Utf8PathBuf;
use clap::Parser;
use itertools::Itertools as _;
use pageserver_api::key::{CompactKey, Key};
use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::RelTag;
/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats.
#[derive(Parser)]
pub(crate) struct PageTraceCmd {
/// Trace input file.
path: Utf8PathBuf,
}
pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> {
let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?);
let mut events: Vec<PageTraceEvent> = Vec::new();
loop {
match bincode::deserialize_from(&mut file) {
Ok(event) => events.push(event),
Err(err) => {
if let bincode::ErrorKind::Io(ref err) = *err {
if err.kind() == std::io::ErrorKind::UnexpectedEof {
break;
}
}
return Err(err.into());
}
}
}
let mut reads_by_relation: HashMap<RelTag, i64> = HashMap::new();
let mut reads_by_key: HashMap<CompactKey, i64> = HashMap::new();
for event in events {
let key = Key::from_compact(event.key);
let reltag = RelTag {
spcnode: key.field2,
dbnode: key.field3,
relnode: key.field4,
forknum: key.field5,
};
*reads_by_relation.entry(reltag).or_default() += 1;
*reads_by_key.entry(event.key).or_default() += 1;
}
let multi_read_keys = reads_by_key
.into_iter()
.filter(|(_, count)| *count > 1)
.sorted_by_key(|(key, count)| (-*count, *key))
.collect_vec();
println!("Multi-read keys: {}", multi_read_keys.len());
for (key, count) in multi_read_keys {
println!(" {key}: {count}");
}
let reads_by_relation = reads_by_relation
.into_iter()
.sorted_by_key(|(rel, count)| (-*count, *rel))
.collect_vec();
println!("Reads by relation:");
for (reltag, count) in reads_by_relation {
println!(" {reltag}: {count}");
}
Ok(())
}

View File

@@ -1,65 +0,0 @@
use std::{
io::{stdin, stdout, Read, Write},
time::Duration,
};
use clap::Parser;
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
#[derive(clap::Parser)]
struct Args {
connstr: String,
tenant_id: TenantId,
timeline_id: TimelineId,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let Args {
connstr,
tenant_id,
timeline_id,
} = Args::parse();
let client = pageserver_client::page_service::Client::new(connstr).await?;
let client = client.pagestream(tenant_id, timeline_id).await?;
let (mut sender, _receiver) = client.split();
eprintln!("filling the pipe");
let mut msg = 0;
loop {
msg += 1;
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
PagestreamTestRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(23),
not_modified_since: Lsn(23),
},
batch_key: 42,
message: format!("message {}", msg),
},
));
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
eprintln!("pipe seems full");
break;
};
let _: () = res?;
}
let n = stdout().write(b"R")?;
assert_eq!(n, 1);
stdout().flush()?;
eprintln!("waiting for signal to tell us to exit");
let mut buf = [0u8; 1];
stdin().read_exact(&mut buf)?;
eprintln!("termination signal received, exiting");
anyhow::Ok(())
}

View File

@@ -27,7 +27,6 @@ use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::OffloadedTimelineInfo;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantConfigPatchRequest;
use pageserver_api::models::TenantDetails;
@@ -52,9 +51,7 @@ use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use scopeguard::defer;
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
use tokio::time::Instant;
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -1524,71 +1521,6 @@ async fn timeline_gc_unblocking_handler(
block_or_unblock_gc(request, false).await
}
/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding.
/// Use the `pagectl page-trace` command to decode and analyze the output.
async fn timeline_page_trace_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
check_permission(&request, None)?;
let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024);
let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5);
// Convert size limit to event limit based on the serialized size of an event. The event size is
// fixed, as the default bincode serializer uses fixed-width integer encoding.
let event_size = bincode::serialize(&PageTraceEvent::default())
.map_err(|err| ApiError::InternalServerError(err.into()))?
.len();
let event_limit = size_limit / event_size;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
// Install a page trace, unless one is already in progress. We just use a buffered channel,
// which may 2x the memory usage in the worst case, but it's still bounded.
let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit);
let cur = timeline.page_trace.load();
let installed = cur.is_none()
&& timeline
.page_trace
.compare_and_swap(cur, Some(Arc::new(trace_tx)))
.is_none();
if !installed {
return Err(ApiError::Conflict("page trace already active".to_string()));
}
defer!(timeline.page_trace.store(None)); // uninstall on return
// Collect the trace and return it to the client. We could stream the response, but this is
// simple and fine.
let mut body = Vec::with_capacity(size_limit);
let deadline = Instant::now() + Duration::from_secs(time_limit_secs);
while body.len() < size_limit {
tokio::select! {
event = trace_rx.recv() => {
let Some(event) = event else {
break; // shouldn't happen (sender doesn't close, unless timeline dropped)
};
bincode::serialize_into(&mut body, &event)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
}
_ = tokio::time::sleep_until(deadline) => break, // time limit reached
_ = cancel.cancelled() => return Err(ApiError::Cancelled),
}
}
Ok(Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(body))
.unwrap())
}
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
@@ -2923,33 +2855,6 @@ async fn get_utilization(
.map_err(ApiError::InternalServerError)
}
async fn get_taskdump(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
use std::io::Write;
use tokio::runtime::Handle;
let out = async {
let mut out = Vec::new();
// Inside an async block or function.
let handle = Handle::current();
let dump = handle.dump().await;
for (i, task) in dump.tasks().iter().enumerate() {
let trace = task.trace();
writeln!(out, "TASK {i}:")?;
writeln!(out, "{trace}\n")?;
}
Ok(String::from_utf8(out).unwrap())
}
.await
.map_err(|e: anyhow::Error| ApiError::InternalServerError(e))?;
json_response(StatusCode::OK, out)
}
async fn list_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -3574,10 +3479,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|r| api_handler(r, timeline_gc_unblocking_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace",
|r| api_handler(r, timeline_page_trace_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})
@@ -3635,9 +3536,5 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
|r| api_handler(r, put_tenant_timeline_import_wal),
)
.get(
"/debug/taskdump",
|r| api_handler(r, get_taskdump),
)
.any(handler_404))
}

View File

@@ -100,32 +100,6 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_concurrent_initdb",
"Number of initdb processes running"
)
.expect("failed to define a metric")
});
pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_initdb_semaphore_seconds_global",
"Time spent getting a permit from the global initdb semaphore",
STORAGE_OP_BUCKETS.into()
)
.expect("failed to define metric")
});
pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_initdb_seconds_global",
"Time spent performing initdb",
STORAGE_OP_BUCKETS.into()
)
.expect("failed to define metric")
});
// Metrics collected on operations on the storage repository.
#[derive(
Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
@@ -1489,8 +1463,6 @@ pub enum SmgrQueryType {
GetPageAtLsn,
GetDbSize,
GetSlruSegment,
#[cfg(feature = "testing")]
Test,
}
pub(crate) struct SmgrQueryTimePerTimeline {

View File

@@ -67,7 +67,6 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::{basebackup, timed_after_cancellation};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -555,52 +554,37 @@ struct BatchedGetPageRequest {
timer: SmgrOpTimer,
}
#[cfg(feature = "testing")]
struct BatchedTestRequest {
req: models::PagestreamTestRequest,
timer: SmgrOpTimer,
}
/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
/// so that we don't keep the [`Timeline::gate`] open while the batch
/// is being built up inside the [`spsc_fold`] (pagestream pipelining).
enum BatchedFeMessage {
Exists {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
shard: timeline::handle::Handle<TenantManagerTypes>,
req: models::PagestreamExistsRequest,
},
Nblocks {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
shard: timeline::handle::Handle<TenantManagerTypes>,
req: models::PagestreamNblocksRequest,
},
GetPage {
span: Span,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
shard: timeline::handle::Handle<TenantManagerTypes>,
effective_request_lsn: Lsn,
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
},
DbSize {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
shard: timeline::handle::Handle<TenantManagerTypes>,
req: models::PagestreamDbSizeRequest,
},
GetSlruSegment {
span: Span,
timer: SmgrOpTimer,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
shard: timeline::handle::Handle<TenantManagerTypes>,
req: models::PagestreamGetSlruSegmentRequest,
},
#[cfg(feature = "testing")]
Test {
span: Span,
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
requests: Vec<BatchedTestRequest>,
},
RespondError {
span: Span,
error: BatchedPageStreamError,
@@ -621,12 +605,6 @@ impl BatchedFeMessage {
page.timer.observe_execution_start(at);
}
}
#[cfg(feature = "testing")]
BatchedFeMessage::Test { requests, .. } => {
for req in requests {
req.timer.observe_execution_start(at);
}
}
BatchedFeMessage::RespondError { .. } => {}
}
}
@@ -756,7 +734,7 @@ impl PageServerHandler {
BatchedFeMessage::Exists {
span,
timer,
shard: shard.downgrade(),
shard,
req,
}
}
@@ -775,7 +753,7 @@ impl PageServerHandler {
BatchedFeMessage::Nblocks {
span,
timer,
shard: shard.downgrade(),
shard,
req,
}
}
@@ -794,7 +772,7 @@ impl PageServerHandler {
BatchedFeMessage::DbSize {
span,
timer,
shard: shard.downgrade(),
shard,
req,
}
}
@@ -813,7 +791,7 @@ impl PageServerHandler {
BatchedFeMessage::GetSlruSegment {
span,
timer,
shard: shard.downgrade(),
shard,
req,
}
}
@@ -865,7 +843,6 @@ impl PageServerHandler {
)
.await?;
// We're holding the Handle
let effective_request_lsn = match Self::wait_or_get_last_lsn(
&shard,
req.hdr.request_lsn,
@@ -883,27 +860,11 @@ impl PageServerHandler {
};
BatchedFeMessage::GetPage {
span,
shard: shard.downgrade(),
shard,
effective_request_lsn,
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
}
}
#[cfg(feature = "testing")]
PagestreamFeMessage::Test(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
let timer =
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
.await?;
BatchedFeMessage::Test {
span,
shard: shard.downgrade(),
requests: vec![BatchedTestRequest { req, timer }],
}
}
};
Ok(Some(batched_msg))
}
@@ -945,7 +906,9 @@ impl PageServerHandler {
assert_eq!(accum_pages.len(), max_batch_size.get());
return false;
}
if !accum_shard.is_same_handle_as(&this_shard) {
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
{
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
@@ -964,44 +927,6 @@ impl PageServerHandler {
accum_pages.extend(this_pages);
Ok(())
}
#[cfg(feature = "testing")]
(
Ok(BatchedFeMessage::Test {
shard: accum_shard,
requests: accum_requests,
..
}),
BatchedFeMessage::Test {
shard: this_shard,
requests: this_requests,
..
},
) if (|| {
assert!(this_requests.len() == 1);
if accum_requests.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_requests.len(), max_batch_size.get());
return false;
}
if !accum_shard.is_same_handle_as(&this_shard) {
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
let this_batch_key = this_requests[0].req.batch_key;
let accum_batch_key = accum_requests[0].req.batch_key;
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
return false;
}
true
})() =>
{
// ok to batch
accum_requests.extend(this_requests);
Ok(())
}
// something batched already but this message is unbatchable
(_, this_msg) => {
// by default, don't continue batching
@@ -1043,7 +968,7 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message::exists");
(
vec![self
.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
.handle_get_rel_exists_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1060,7 +985,7 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
(
vec![self
.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
.handle_get_nblocks_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1081,7 +1006,7 @@ impl PageServerHandler {
trace!(npages, "handling getpage request");
let res = self
.handle_get_page_at_lsn_request_batched(
&*shard.upgrade()?,
&shard,
effective_request_lsn,
pages,
ctx,
@@ -1103,7 +1028,7 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
(
vec![self
.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
.handle_db_size_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1120,7 +1045,7 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
(
vec![self
.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
.handle_get_slru_segment_request(&shard, &req, ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1128,27 +1053,6 @@ impl PageServerHandler {
span,
)
}
#[cfg(feature = "testing")]
BatchedFeMessage::Test {
span,
shard,
requests,
} => {
fail::fail_point!("ps::handle-pagerequest-message::test");
(
{
let npages = requests.len();
trace!(npages, "handling getpage request");
let res = self
.handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
.instrument(span.clone())
.await;
assert_eq!(res.len(), npages);
res
},
span,
)
}
BatchedFeMessage::RespondError { span, error } => {
// We've already decided to respond with an error, so we don't need to
// call the handler.
@@ -1814,20 +1718,6 @@ impl PageServerHandler {
.query_metrics
.observe_getpage_batch_start(requests.len());
// If a page trace is running, submit an event for this request.
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
let time = SystemTime::now();
for batch in &requests {
let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact();
// Ignore error (trace buffer may be full or tracer may have disconnected).
_ = page_trace.try_send(PageTraceEvent {
key,
effective_lsn,
time,
});
}
}
let results = timeline
.get_rel_page_at_lsn_batched(
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
@@ -1886,51 +1776,6 @@ impl PageServerHandler {
))
}
// NB: this impl mimics what we do for batched getpage requests.
#[cfg(feature = "testing")]
#[instrument(skip_all, fields(shard_id))]
async fn handle_test_request_batch(
&mut self,
timeline: &Timeline,
requests: Vec<BatchedTestRequest>,
_ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
// real requests would do something with the timeline
let mut results = Vec::with_capacity(requests.len());
for _req in requests.iter() {
tokio::task::yield_now().await;
results.push({
if timeline.cancel.is_cancelled() {
Err(PageReconstructError::Cancelled)
} else {
Ok(())
}
});
}
// TODO: avoid creating the new Vec here
Vec::from_iter(
requests
.into_iter()
.zip(results.into_iter())
.map(|(req, res)| {
res.map(|()| {
(
PagestreamBeMessage::Test(models::PagestreamTestResponse {
req: req.req.clone(),
}),
req.timer,
)
})
.map_err(|e| BatchedPageStreamError {
err: PageStreamError::from(e),
req: req.req.hdr,
})
}),
)
}
/// Note on "fullbackup":
/// Full basebackups should only be used for debugging purposes.
/// Originally, it was introduced to enable breaking storage format changes,
@@ -2546,14 +2391,6 @@ impl From<GetActiveTimelineError> for QueryError {
}
}
impl From<crate::tenant::timeline::handle::HandleUpgradeError> for QueryError {
fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self {
match e {
crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown,
}
}
}
fn set_tracing_field_shard_id(timeline: &Timeline) {
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
tracing::Span::current().record(

View File

@@ -95,9 +95,6 @@ use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::CONCURRENT_INITDBS;
use crate::metrics::INITDB_RUN_TIME;
use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
@@ -5350,17 +5347,8 @@ async fn run_initdb(
initdb_bin_path, initdb_target_dir, initdb_lib_dir,
);
let _permit = {
let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
INIT_DB_SEMAPHORE.acquire().await
};
let _permit = INIT_DB_SEMAPHORE.acquire().await;
CONCURRENT_INITDBS.inc();
scopeguard::defer! {
CONCURRENT_INITDBS.dec();
}
let _timer = INITDB_RUN_TIME.start_timer();
let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser: &conf.superuser,
locale: &conf.locale,

View File

@@ -382,12 +382,6 @@ pub(crate) struct RemoteTimelineClient {
cancel: CancellationToken,
}
impl Drop for RemoteTimelineClient {
fn drop(&mut self) {
debug!("dropping RemoteTimelineClient");
}
}
impl RemoteTimelineClient {
///
/// Create a remote storage client for given timeline
@@ -803,12 +797,6 @@ impl RemoteTimelineClient {
upload_queue.dirty.metadata.apply(update);
// Defense in depth: if we somehow generated invalid metadata, do not persist it.
upload_queue
.dirty
.validate()
.map_err(|e| anyhow::anyhow!(e))?;
self.schedule_index_upload(upload_queue);
Ok(())

View File

@@ -152,21 +152,6 @@ impl IndexPart {
};
is_same_remote_layer_path(name, metadata, name, index_metadata)
}
/// Check for invariants in the index: this is useful when uploading an index to ensure that if
/// we encounter a bug, we do not persist buggy metadata.
pub(crate) fn validate(&self) -> Result<(), String> {
if self.import_pgdata.is_none()
&& self.metadata.ancestor_timeline().is_none()
&& self.layer_metadata.is_empty()
{
// Unless we're in the middle of a raw pgdata import, or this is a child timeline,the index must
// always have at least one layer.
return Err("Index has no ancestor and no layers".to_string());
}
Ok(())
}
}
/// Metadata gathered for each of the layer files.

View File

@@ -40,10 +40,6 @@ pub(crate) async fn upload_index_part(
});
pausable_failpoint!("before-upload-index-pausable");
// Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this
// (this should never happen)
index_part.validate().map_err(|e| anyhow::anyhow!(e))?;
// FIXME: this error comes too late
let serialized = index_part.to_json_bytes()?;
let serialized = Bytes::from(serialized);

View File

@@ -1,6 +1,6 @@
use std::time::UNIX_EPOCH;
use pageserver_api::key::{Key, CONTROLFILE_KEY};
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use utils::{
completion::{self, Completion},
@@ -9,10 +9,7 @@ use utils::{
use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::{
context::DownloadBehavior,
tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
};
use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to wanted await point, and not futher.
@@ -34,51 +31,20 @@ async fn smoke_test() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
let image_layers = vec![(
Lsn(0x40),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
test_img("foo"),
)],
)];
// Create a test timeline with one real layer, and one synthetic test layer. The synthetic
// one is only there so that we can GC the real one without leaving the timeline's metadata
// empty, which is an illegal state (see [`IndexPart::validate`]).
let timeline = tenant
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
&ctx,
Default::default(),
image_layers,
Lsn(0x100),
)
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
// Grab one of the timeline's layers to exercise in the test, and the other layer that is just
// there to avoid the timeline being illegally empty
let (layer, dummy_layer) = {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 2);
assert_eq!(layers.len(), 1);
layers.sort_by_key(|l| l.layer_desc().get_key_range().start);
let synthetic_layer = layers.pop().unwrap();
let real_layer = layers.pop().unwrap();
tracing::info!(
"real_layer={:?} ({}), synthetic_layer={:?} ({})",
real_layer,
real_layer.layer_desc().file_size,
synthetic_layer,
synthetic_layer.layer_desc().file_size
);
(real_layer, synthetic_layer)
layers.swap_remove(0)
};
// all layers created at pageserver are like `layer`, initialized with strong
@@ -207,13 +173,10 @@ async fn smoke_test() {
let rtc = &timeline.remote_client;
// Simulate GC removing our test layer.
{
let mut g = timeline.layers.write().await;
let layers = &[layer];
let mut g = timeline.layers.write().await;
g.open_mut().unwrap().finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
@@ -228,10 +191,7 @@ async fn smoke_test() {
rtc.wait_completion().await.unwrap();
assert_eq!(
rtc.get_remote_physical_size(),
dummy_layer.metadata().file_size
);
assert_eq!(rtc.get_remote_physical_size(), 0);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}

View File

@@ -14,7 +14,7 @@ pub mod uninit;
mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use arc_swap::{ArcSwap, ArcSwapOption};
use arc_swap::ArcSwap;
use bytes::Bytes;
use camino::Utf8Path;
use chrono::{DateTime, Utc};
@@ -23,7 +23,6 @@ use fail::fail_point;
use handle::ShardTimelineId;
use offload::OffloadError;
use once_cell::sync::Lazy;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::{
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
key::{
@@ -43,7 +42,6 @@ use rand::Rng;
use remote_storage::DownloadError;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::sync::mpsc::Sender;
use tokio::{
runtime::Handle,
sync::{oneshot, watch},
@@ -51,9 +49,7 @@ use tokio::{
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{
fs_ext,
guard_arc_swap::GuardArcSwap,
pausable_failpoint,
fs_ext, pausable_failpoint,
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
@@ -76,7 +72,6 @@ use std::{pin::pin, sync::OnceLock};
use crate::{
aux_file::AuxFileSizeEstimator,
page_service::TenantManagerTypes,
tenant::{
config::AttachmentMode,
layer_map::{LayerMap, SearchResult},
@@ -356,8 +351,8 @@ pub struct Timeline {
// though let's keep them both for better error visibility.
pub initdb_lsn: Lsn,
/// The repartitioning result. Allows a single writer and multiple readers.
pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
/// When did we last calculate the partitioning? Make it pub to test cases.
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
@@ -432,15 +427,12 @@ pub struct Timeline {
pub(crate) l0_flush_global_state: L0FlushGlobalState,
pub(crate) handles: handle::PerTimelineState<TenantManagerTypes>,
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
/// Cf. [`crate::tenant::CreateTimelineIdempotency`].
pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
/// If Some, collects GetPage metadata for an ongoing PageTrace.
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
}
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -2343,8 +2335,7 @@ impl Timeline {
// initial logical size is 0.
LogicalSize::empty_initial()
},
partitioning: GuardArcSwap::new((
partitioning: tokio::sync::Mutex::new((
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
Lsn(0),
)),
@@ -2389,8 +2380,6 @@ impl Timeline {
attach_wal_lag_cooldown,
create_idempotency,
page_trace: Default::default(),
};
result.repartition_threshold =
@@ -4032,15 +4021,18 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
let Ok(mut guard) = self.partitioning.try_write_guard() else {
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
// Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for
// gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own
// heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction.
return Err(CompactionError::Other(anyhow!(
"repartition() called concurrently"
"repartition() called concurrently, this is rare and a retry should be fine"
)));
};
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
if lsn < *partition_lsn {
return Err(CompactionError::Other(anyhow!(
"repartition() called with LSN going backwards, this should not happen"
@@ -4068,9 +4060,9 @@ impl Timeline {
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
}; // no partitioning for metadata keys for now
let result = ((dense_partitioning, sparse_partitioning), lsn);
guard.write(result.clone());
Ok(result)
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}
// Is it time to create a new image layer for the given partition?
@@ -4626,10 +4618,6 @@ impl Drop for Timeline {
}
}
}
info!(
"Timeline {} for tenant {} is being dropped",
self.timeline_id, self.tenant_shard_id.tenant_id
);
}
}
@@ -5678,17 +5666,9 @@ impl Timeline {
info!("force created image layer {}", image_layer.local_path());
{
let mut guard = self.layers.write().await;
guard
.open_mut()
.unwrap()
.force_insert_layer(image_layer.clone());
guard.open_mut().unwrap().force_insert_layer(image_layer);
}
// Update remote_timeline_client state to reflect existence of this layer
self.remote_client
.schedule_layer_file_upload(image_layer)
.unwrap();
Ok(())
}
@@ -5739,17 +5719,9 @@ impl Timeline {
info!("force created delta layer {}", delta_layer.local_path());
{
let mut guard = self.layers.write().await;
guard
.open_mut()
.unwrap()
.force_insert_layer(delta_layer.clone());
guard.open_mut().unwrap().force_insert_layer(delta_layer);
}
// Update remote_timeline_client state to reflect existence of this layer
self.remote_client
.schedule_layer_file_upload(delta_layer)
.unwrap();
Ok(())
}

View File

@@ -1776,10 +1776,7 @@ impl Timeline {
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
) -> anyhow::Result<KeyHistoryRetention> {
// Pre-checks for the invariants
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
if debug_mode {
if cfg!(debug_assertions) {
for (log_key, _, _) in full_history {
assert_eq!(log_key, &key, "mismatched key");
}
@@ -1925,19 +1922,15 @@ impl Timeline {
output
}
let mut key_exists = false;
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
// TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
records_since_last_image += split_for_lsn.len();
// Whether to produce an image into the final layer files
let produce_image = if i == 0 && !has_ancestor {
let generate_image = if i == 0 && !has_ancestor {
// We always generate images for the first batch (below horizon / lowest retain_lsn)
true
} else if i == batch_cnt - 1 {
// Do not generate images for the last batch (above horizon)
false
} else if records_since_last_image == 0 {
false
} else if records_since_last_image >= delta_threshold_cnt {
// Generate images when there are too many records
true
@@ -1952,45 +1945,29 @@ impl Timeline {
break;
}
}
if replay_history.is_empty() && !key_exists {
// The key does not exist at earlier LSN, we can skip this iteration.
retention.push(Vec::new());
continue;
} else {
key_exists = true;
if let Some((_, _, val)) = replay_history.first() {
if !val.will_init() {
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
|| {
generate_debug_trace(
Some(&replay_history),
full_history,
retain_lsn_below_horizon,
horizon,
)
},
);
}
}
let Some((_, _, val)) = replay_history.first() else {
unreachable!("replay history should not be empty once it exists")
};
if !val.will_init() {
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| {
generate_debug_trace(
Some(&replay_history),
full_history,
retain_lsn_below_horizon,
horizon,
)
});
}
// Whether to reconstruct the image. In debug mode, we will generate an image
// at every retain_lsn to ensure data is not corrupted, but we won't put the
// image into the final layer.
let generate_image = produce_image || debug_mode;
if produce_image {
if generate_image && records_since_last_image > 0 {
records_since_last_image = 0;
}
let img_and_lsn = if generate_image {
let replay_history_for_debug = if debug_mode {
let replay_history_for_debug = if cfg!(debug_assertions) {
Some(replay_history.clone())
} else {
None
};
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
let history = if produce_image {
std::mem::take(&mut replay_history)
} else {
replay_history.clone()
};
let history = std::mem::take(&mut replay_history);
let mut img = None;
let mut records = Vec::with_capacity(history.len());
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2027,20 +2004,8 @@ impl Timeline {
}
records.reverse();
let state = ValueReconstructState { img, records };
// last batch does not generate image so i is always in range, unless we force generate
// an image during testing
let request_lsn = if i >= lsn_split_points.len() {
Lsn::MAX
} else {
lsn_split_points[i]
};
let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
let img = self.reconstruct_value(key, request_lsn, state).await?;
Some((request_lsn, img))
} else {
None
};
if produce_image {
let (request_lsn, img) = img_and_lsn.unwrap();
replay_history.push((key, request_lsn, Value::Image(img.clone())));
retention.push(vec![(request_lsn, Value::Image(img))]);
} else {
@@ -2146,7 +2111,12 @@ impl Timeline {
let mut compact_jobs = Vec::new();
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
let ((dense_ks, sparse_ks), _) = {
let Ok(partition) = self.partitioning.try_lock() else {
bail!("failed to acquire partition lock during gc-compaction");
};
partition.clone()
};
// Truncate the key range to be within user specified compaction range.
fn truncate_to(
source_start: &Key,
@@ -2303,8 +2273,6 @@ impl Timeline {
let compact_key_range = job.compact_key_range;
let compact_lsn_range = job.compact_lsn_range;
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);
scopeguard::defer! {
@@ -2430,7 +2398,7 @@ impl Timeline {
.first()
.copied()
.unwrap_or(job_desc.gc_cutoff);
if debug_mode {
if cfg!(debug_assertions) {
assert_eq!(
res,
job_desc

View File

@@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory(
}
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`make_timeline_delete_guard`]
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_maybe_offloaded_timeline_from_tenant(
tenant: &Tenant,
timeline: &TimelineOrOffloaded,
@@ -193,8 +193,10 @@ impl DeleteTimelineFlow {
) -> Result<(), DeleteTimelineError> {
super::debug_assert_current_span_has_tenant_and_timeline_id();
let allow_offloaded_children = false;
let set_stopping = true;
let (timeline, mut guard) =
make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?;
Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;
guard.mark_in_progress()?;
@@ -331,6 +333,75 @@ impl DeleteTimelineFlow {
Ok(())
}
pub(super) fn prepare(
tenant: &Tenant,
timeline_id: TimelineId,
allow_offloaded_children: bool,
set_stopping: bool,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
// This is important because when you take into account `remove_timeline_from_tenant`
// we remove timeline from memory when we still hold the deletion guard.
// So here when timeline deletion is finished timeline wont be present in timelines map at all
// which makes the following sequence impossible:
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
let timelines = tenant.timelines.lock().unwrap();
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
None => match timelines_offloaded.get(&timeline_id) {
Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
None => return Err(DeleteTimelineError::NotFound),
},
};
// Ensure that there are no child timelines, because we are about to remove files,
// which will break child branches
let mut children = Vec::new();
if !allow_offloaded_children {
children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
(entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
}));
}
children.extend(timelines.iter().filter_map(|(id, entry)| {
(entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
}));
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
// Note that using try_lock here is important to avoid a deadlock.
// Here we take lock on timelines and then the deletion guard.
// At the end of the operation we're holding the guard and need to lock timelines map
// to remove the timeline from it.
// Always if you have two locks that are taken in different order this can result in a deadlock.
let delete_progress = Arc::clone(timeline.delete_progress());
let delete_lock_guard = match delete_progress.try_lock_owned() {
Ok(guard) => DeletionGuard(guard),
Err(_) => {
// Unfortunately if lock fails arc is consumed.
return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
timeline.delete_progress(),
)));
}
};
if set_stopping {
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
timeline.set_state(TimelineState::Stopping);
}
}
Ok((timeline, delete_lock_guard))
}
fn schedule_background(
guard: DeletionGuard,
conf: &'static PageServerConf,
@@ -412,80 +483,6 @@ impl DeleteTimelineFlow {
}
}
#[derive(Copy, Clone, PartialEq, Eq)]
pub(super) enum TimelineDeleteGuardKind {
Offload,
Delete,
}
pub(super) fn make_timeline_delete_guard(
tenant: &Tenant,
timeline_id: TimelineId,
guard_kind: TimelineDeleteGuardKind,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
// This is important because when you take into account `remove_timeline_from_tenant`
// we remove timeline from memory when we still hold the deletion guard.
// So here when timeline deletion is finished timeline wont be present in timelines map at all
// which makes the following sequence impossible:
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
let timelines = tenant.timelines.lock().unwrap();
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
None => match timelines_offloaded.get(&timeline_id) {
Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
None => return Err(DeleteTimelineError::NotFound),
},
};
// Ensure that there are no child timelines, because we are about to remove files,
// which will break child branches
let mut children = Vec::new();
if guard_kind == TimelineDeleteGuardKind::Delete {
children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
(entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
}));
}
children.extend(timelines.iter().filter_map(|(id, entry)| {
(entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
}));
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
// Note that using try_lock here is important to avoid a deadlock.
// Here we take lock on timelines and then the deletion guard.
// At the end of the operation we're holding the guard and need to lock timelines map
// to remove the timeline from it.
// Always if you have two locks that are taken in different order this can result in a deadlock.
let delete_progress = Arc::clone(timeline.delete_progress());
let delete_lock_guard = match delete_progress.try_lock_owned() {
Ok(guard) => DeletionGuard(guard),
Err(_) => {
// Unfortunately if lock fails arc is consumed.
return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
timeline.delete_progress(),
)));
}
};
if guard_kind == TimelineDeleteGuardKind::Delete {
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
timeline.set_state(TimelineState::Stopping);
}
}
Ok((timeline, delete_lock_guard))
}
pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
impl Deref for DeletionGuard {

View File

@@ -32,151 +32,54 @@
//!
//! # Design
//!
//! ## Data Structures
//!
//! There are three user-facing data structures:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
//!
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
//!
//! The `HandleInner` is allocated as a `Arc<Mutex<HandleInner>>` and
//! referenced weakly and strongly from various places which we are now illustrating.
//! For brevity, we will omit the `Arc<Mutex<>>` part in the following and instead
//! use `strong ref` and `weak ref` when referring to the `Arc<Mutex<HandleInner>>`
//! or `Weak<Mutex<HandleInner>>`, respectively.
//!
//! - The `Handle` is a strong ref.
//! - The `WeakHandle` is a weak ref.
//! - The `PerTimelineState` contains a `HashMap<CacheId, strong ref>`.
//! - The `Cache` is a `HashMap<unique identifier for the shard, weak ref>`.
//!
//! Lifetimes:
//! - `WeakHandle` and `Handle`: single pagestream request.
//! - `Cache`: single page service connection.
//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`).
//!
//! ## Request Handling Flow (= filling and using the `Cache``)
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
//!
//! To dispatch a request, the page service connection calls `Cache::get`.
//!
//! A cache miss means we consult the tenant manager for shard routing,
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
//! and a strong ref in the `PerTimelineState`.
//! A strong ref is returned wrapped in a `Handle`.
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
//!
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
//! and find the weak ref in the cache.
//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`.
//! and find the `Weak<HandleInner>` in the cache.
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
//!
//! The pagestream processing is pipelined and involves a batching step.
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
//!
//! # Performance
//! # Memory Management / How The Reference Cycle Is Broken
//!
//! Remember from the introductory section:
//! The attentive reader may have noticed the strong reference cycle
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
//!
//! > However, we want to avoid the overhead of entering the gate for every
//! > method invocation.
//!
//! Why do we want to avoid that?
//! Because the gate is a shared location in memory and entering it involves
//! bumping refcounts, which leads to cache contention if done frequently
//! from multiple cores in parallel.
//!
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
//! That `Arc` is private to the `HandleInner` and hence to the connection.
//! (Review the "Data Structures" section if that is unclear to you.)
//!
//! A `WeakHandle` is a weak ref to the `HandleInner`.
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
//!
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
//! Again, this is cheap because the `Arc` is private to the connection.
//!
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
//!
//! # Shutdown
//!
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
//! ```
//!
//! Further, there is this cycle:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
//! ```
//!
//! The former cycle is a memory leak if not broken.
//! The latter cycle further prevents the Timeline from shutting down
//! because we certainly won't drop the Timeline while the GateGuard is alive.
//! Preventing shutdown is the whole point of this handle/cache system,
//! but when the Timeline needs to shut down, we need to break the cycle.
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
//!
//! The cycle is broken by either
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
//! - Connection shutdown (=> dropping the `Cache`).
//! - `PerTimelineState::shutdown` or
//! - dropping the `Cache`.
//!
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
//! `Arc<GateGuard>`.
//!
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
//! thereby breaking the cycle.
//! It also initiates draining of already existing `Handle`s by
//! poisoning things so that no new `HandleInner`'s can be added
//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail.
//!
//! Concurrently existing / already upgraded `Handle`s will extend the
//! lifetime of the `Arc<Mutex<HandleInner>>` and hence cycles.
//! Concurrently existing `Handle`s will extend the existence of the cycle.
//! However, since `Handle`s are short-lived and new `Handle`s are not
//! handed out from `Cache::get` or `WeakHandle::upgrade` after
//! `PerTimelineState::shutdown`, that extension of the cycle is bounded.
//!
//! Concurrently existing `WeakHandle`s will fail to `upgrade()`:
//! while they will succeed in upgrading `Weak<Mutex<HandleInner>>`,
//! they will find the inner in state `HandleInner::ShutDown` state where the
//! `Arc<GateGuard>` and Timeline has already been dropped.
//!
//! Dropping the `Cache` undoes the registration of this `Cache`'s
//! `HandleInner`s from all the `PerTimelineState`s, i.e., it
//! removes the strong ref to each of its `HandleInner`s
//! from all the `PerTimelineState`.
//!
//! # Locking Rules
//!
//! To prevent deadlocks we:
//!
//! 1. Only ever hold one of the locks at a time.
//! 2. Don't add more than one Drop impl that locks on the
//! cycles above.
//!
//! As per (2), that impl is in `Drop for Cache`.
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
//! that extension of the cycle is bounded.
//!
//! # Fast Path for Shard Routing
//!
//! The `Cache` has a fast path for shard routing to avoid calling into
//! the tenant manager for every request.
//!
//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s.
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
//!
//! The current implementation uses the first entry in the hash map
//! to determine the `ShardParameters` and derive the correct
@@ -184,18 +87,18 @@
//!
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
//!
//! If the lookup is successful and the `WeakHandle` can be upgraded,
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
//! it's a hit.
//!
//! ## Cache invalidation
//!
//! The insight is that cache invalidation is sufficient and most efficiently if done lazily.
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
//! The only reasons why an entry in the cache can become stale are:
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
//! being detached, timeline or shard deleted, or pageserver is shutting down.
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
//!
//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
//! timeline has shut down, and when that happens, we remove the entry from the cache.
//!
//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
@@ -204,6 +107,8 @@
use std::collections::hash_map;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
@@ -247,7 +152,7 @@ pub(crate) struct Cache<T: Types> {
map: Map<T>,
}
type Map<T> = HashMap<ShardTimelineId, WeakHandle<T>>;
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
impl<T: Types> Default for Cache<T> {
fn default() -> Self {
@@ -265,22 +170,12 @@ pub(crate) struct ShardTimelineId {
}
/// See module-level comment.
pub(crate) struct Handle<T: Types> {
timeline: Arc<T::Timeline>,
#[allow(dead_code)] // the field exists to keep the gate open
gate_guard: Arc<utils::sync::gate::GateGuard>,
inner: Arc<Mutex<HandleInner<T>>>,
}
pub(crate) struct WeakHandle<T: Types> {
inner: Weak<Mutex<HandleInner<T>>>,
}
enum HandleInner<T: Types> {
KeepingTimelineGateOpen {
#[allow(dead_code)]
gate_guard: Arc<utils::sync::gate::GateGuard>,
timeline: Arc<T::Timeline>,
},
ShutDown,
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
struct HandleInner<T: Types> {
shut_down: AtomicBool,
timeline: T::Timeline,
// The timeline's gate held open.
_gate_guard: utils::sync::gate::GateGuard,
}
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
@@ -288,8 +183,7 @@ enum HandleInner<T: Types> {
/// See module-level comment for details.
pub struct PerTimelineState<T: Types> {
// None = shutting down
#[allow(clippy::type_complexity)]
handles: Mutex<Option<HashMap<CacheId, Arc<Mutex<HandleInner<T>>>>>>,
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
}
impl<T: Types> Default for PerTimelineState<T> {
@@ -349,24 +243,49 @@ impl<T: Types> Cache<T> {
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
// terminates because when every iteration we remove an element from the map
let miss: ShardSelector = loop {
// terminates because each iteration removes an element from the map
loop {
let handle = self
.get_impl(timeline_id, shard_selector, tenant_manager)
.await?;
if handle.0.shut_down.load(Ordering::Relaxed) {
let removed = self
.map
.remove(&handle.0.timeline.shard_timeline_id())
.expect("invariant of get_impl is that the returned handle is in the map");
assert!(
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
"shard_timeline_id() incorrect?"
);
} else {
return Ok(handle);
}
}
}
#[instrument(level = "trace", skip_all)]
async fn get_impl(
&mut self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
tenant_manager: &T::TenantManager,
) -> Result<Handle<T>, GetError<T>> {
let miss: ShardSelector = {
let routing_state = self.shard_routing(timeline_id, shard_selector);
match routing_state {
RoutingResult::FastPath(handle) => return Ok(handle),
RoutingResult::SlowPath(key) => match self.map.get(&key) {
Some(cached) => match cached.upgrade() {
Ok(upgraded) => return Ok(upgraded),
Err(HandleUpgradeError::ShutDown) => {
// TODO: dedup with shard_routing()
Some(upgraded) => return Ok(Handle(upgraded)),
None => {
trace!("handle cache stale");
self.map.remove(&key).unwrap();
continue;
ShardSelector::Known(key.shard_index)
}
},
None => break ShardSelector::Known(key.shard_index),
None => ShardSelector::Known(key.shard_index),
},
RoutingResult::NeedConsultTenantManager => break shard_selector,
RoutingResult::NeedConsultTenantManager => shard_selector,
}
};
self.get_miss(timeline_id, miss, tenant_manager).await
@@ -383,7 +302,7 @@ impl<T: Types> Cache<T> {
let Some((first_key, first_handle)) = self.map.iter().next() else {
return RoutingResult::NeedConsultTenantManager;
};
let Ok(first_handle) = first_handle.upgrade() else {
let Some(first_handle) = first_handle.upgrade() else {
// TODO: dedup with get()
trace!("handle cache stale");
let first_key_owned = *first_key;
@@ -391,7 +310,7 @@ impl<T: Types> Cache<T> {
continue;
};
let first_handle_shard_identity = first_handle.get_shard_identity();
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
shard_number: shard_num,
shard_count: first_handle_shard_identity.count,
@@ -410,11 +329,11 @@ impl<T: Types> Cache<T> {
};
let first_handle_shard_timeline_id = ShardTimelineId {
shard_index: first_handle_shard_identity.shard_index(),
timeline_id: first_handle.shard_timeline_id().timeline_id,
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
};
if need_shard_timeline_id == first_handle_shard_timeline_id {
return RoutingResult::FastPath(first_handle);
return RoutingResult::FastPath(Handle(first_handle));
} else {
return RoutingResult::SlowPath(need_shard_timeline_id);
}
@@ -438,30 +357,23 @@ impl<T: Types> Cache<T> {
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
}
trace!("creating new HandleInner");
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
gate_guard: Arc::new(
// this enter() is expensive in production code because
// it hits the global Arc<Timeline>::gate refcounts
match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
},
),
// this clone is expensive in production code because
// it hits the global Arc<Timeline>::clone refcounts
timeline: Arc::new(timeline.clone()),
}));
let handle_weak = WeakHandle {
inner: Arc::downgrade(&handle_inner_arc),
let gate_guard = match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
};
let handle = handle_weak
.upgrade()
.ok()
.expect("we just created it and it's not linked anywhere yet");
{
trace!("creating new HandleInner");
let handle = Arc::new(
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
// so we can identify reference cycle bugs.
HandleInner {
shut_down: AtomicBool::new(false),
_gate_guard: gate_guard,
timeline: timeline.clone(),
},
);
let handle = {
let mut lock_guard = timeline
.per_timeline_state()
.handles
@@ -469,8 +381,7 @@ impl<T: Types> Cache<T> {
.expect("mutex poisoned");
match &mut *lock_guard {
Some(per_timeline_state) => {
let replaced =
per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
assert!(replaced.is_none(), "some earlier code left a stale handle");
match self.map.entry(key) {
hash_map::Entry::Occupied(_o) => {
@@ -481,7 +392,8 @@ impl<T: Types> Cache<T> {
unreachable!()
}
hash_map::Entry::Vacant(v) => {
v.insert(handle_weak);
v.insert(Arc::downgrade(&handle));
handle
}
}
}
@@ -489,62 +401,14 @@ impl<T: Types> Cache<T> {
return Err(GetError::PerTimelineStateShutDown);
}
}
}
Ok(handle)
};
Ok(Handle(handle))
}
Err(e) => Err(GetError::TenantManager(e)),
}
}
}
pub(crate) enum HandleUpgradeError {
ShutDown,
}
impl<T: Types> WeakHandle<T> {
pub(crate) fn upgrade(&self) -> Result<Handle<T>, HandleUpgradeError> {
let Some(inner) = Weak::upgrade(&self.inner) else {
return Err(HandleUpgradeError::ShutDown);
};
let lock_guard = inner.lock().expect("poisoned");
match &*lock_guard {
HandleInner::KeepingTimelineGateOpen {
timeline,
gate_guard,
} => {
let gate_guard = Arc::clone(gate_guard);
let timeline = Arc::clone(timeline);
drop(lock_guard);
Ok(Handle {
timeline,
gate_guard,
inner,
})
}
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
}
}
pub(crate) fn is_same_handle_as(&self, other: &WeakHandle<T>) -> bool {
Weak::ptr_eq(&self.inner, &other.inner)
}
}
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.timeline
}
}
impl<T: Types> Handle<T> {
pub(crate) fn downgrade(&self) -> WeakHandle<T> {
WeakHandle {
inner: Arc::downgrade(&self.inner),
}
}
}
impl<T: Types> PerTimelineState<T> {
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
/// to the [`Types::Timeline`] that embeds this per-timeline state.
@@ -566,62 +430,43 @@ impl<T: Types> PerTimelineState<T> {
trace!("already shut down");
return;
};
for handle_inner_arc in handles.values() {
for handle in handles.values() {
// Make hits fail.
let mut lock_guard = handle_inner_arc.lock().expect("poisoned");
lock_guard.shutdown();
handle.shut_down.store(true, Ordering::Relaxed);
}
drop(handles);
}
}
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.0.timeline
}
}
#[cfg(test)]
impl<T: Types> Drop for HandleInner<T> {
fn drop(&mut self) {
trace!("HandleInner dropped");
}
}
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
fn drop(&mut self) {
for (
_,
WeakHandle {
inner: handle_inner_weak,
},
) in self.map.drain()
{
let Some(handle_inner_arc) = handle_inner_weak.upgrade() else {
continue;
};
let Some(handle_timeline) = handle_inner_arc
// locking rules: drop lock before acquiring other lock below
.lock()
.expect("poisoned")
.shutdown()
else {
// Concurrent PerTimelineState::shutdown.
continue;
};
// Clean up per_timeline_state so the HandleInner allocation can be dropped.
let per_timeline_state = handle_timeline.per_timeline_state();
let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned");
let Some(handles) = &mut *handles_lock_guard else {
continue;
};
let Some(removed_handle_inner_arc) = handles.remove(&self.id) else {
// Concurrent PerTimelineState::shutdown.
continue;
};
drop(handles_lock_guard); // locking rules!
assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc));
}
}
}
impl<T: Types> HandleInner<T> {
fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
match std::mem::replace(self, HandleInner::ShutDown) {
HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
HandleInner::ShutDown => {
// Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
// may do it concurrently, but locking rules disallow holding per-timeline-state lock and
// the handle lock at the same time.
None
for (_, weak) in self.map.drain() {
if let Some(strong) = weak.upgrade() {
// handle is still being kept alive in PerTimelineState
let timeline = strong.timeline.per_timeline_state();
let mut handles = timeline.handles.lock().expect("mutex poisoned");
if let Some(handles) = &mut *handles {
let Some(removed) = handles.remove(&self.id) else {
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
continue;
};
assert!(Arc::ptr_eq(&removed, &strong));
}
}
}
}
@@ -629,8 +474,6 @@ impl<T: Types> HandleInner<T> {
#[cfg(test)]
mod tests {
use std::sync::Weak;
use pageserver_api::{
key::{rel_block_to_key, Key, DBDIR_KEY},
models::ShardParameters,
@@ -740,13 +583,39 @@ mod tests {
//
// fill the cache
//
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
let handle: Handle<_> = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
let handle_inner_weak = Arc::downgrade(&handle.0);
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
assert_eq!(
(
Weak::strong_count(&handle_inner_weak),
Weak::weak_count(&handle_inner_weak)
),
(2, 2),
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
);
assert_eq!(cache.map.len(), 1);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
drop(handle);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
);
//
// demonstrate that Handle holds up gate closure
@@ -771,11 +640,21 @@ mod tests {
// SHUTDOWN
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
assert_eq!(
1,
Weak::strong_count(&handle_inner_weak),
"through local var handle"
);
assert_eq!(
cache.map.len(),
1,
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(3, 1),
"strong: handleinner(via handle), shard0, mgr; weak: myself"
);
// this handle is perfectly usable
handle.getpage();
@@ -799,6 +678,16 @@ mod tests {
}
drop(handle);
assert_eq!(
0,
Weak::strong_count(&handle_inner_weak),
"the HandleInner destructor already ran"
);
assert_eq!(
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
(2, 1),
"strong: shard0, mgr; weak: myself"
);
// closing gate succeeds after dropping handle
tokio::select! {
@@ -817,8 +706,10 @@ mod tests {
assert_eq!(cache.map.len(), 0);
// ensure all refs to shard0 are gone and we're not leaking anything
let myself = Weak::clone(&shard0.myself);
drop(shard0);
drop(mgr);
assert_eq!(Weak::strong_count(&myself), 0);
}
#[tokio::test]
@@ -1057,11 +948,15 @@ mod tests {
handle
};
handle.getpage();
used_handles.push(Arc::downgrade(&handle.timeline));
used_handles.push(Arc::downgrade(&handle.0));
}
// No handles exist, thus gates are closed and don't require shutdown.
// Thus the gate should close immediately, even without shutdown.
// No handles exist, thus gates are closed and don't require shutdown
assert!(used_handles
.iter()
.all(|weak| Weak::strong_count(weak) == 0));
// ... thus the gate should close immediately, even without shutdown
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
@@ -1069,172 +964,4 @@ mod tests {
}
}
}
#[tokio::test(start_paused = true)]
async fn test_weak_handles() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let refcount_start = Arc::strong_count(&shard0);
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
let weak_handle = handle.downgrade();
drop(handle);
let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it");
// Start shutdown
shard0.per_timeline_state.shutdown();
// Upgrades during shutdown don't work, even if upgraded_handle exists.
weak_handle
.upgrade()
.err()
.expect("can't upgrade weak handle as soon as shutdown started");
// But upgraded_handle is still alive, so the gate won't close.
tokio::select! {
_ = shard0.gate.close() => {
panic!("handle is keeping gate open");
}
_ = tokio::time::sleep(FOREVER) => { }
}
// Drop the last handle.
drop(upgraded_handle);
// The gate should close now, despite there still being a weak_handle.
tokio::select! {
_ = shard0.gate.close() => { }
_ = tokio::time::sleep(FOREVER) => {
panic!("only strong handle is dropped and we shut down per-timeline-state")
}
}
// The weak handle still can't be upgraded.
weak_handle
.upgrade()
.err()
.expect("still shouldn't be able to upgrade the weak handle");
// There should be no strong references to the timeline object except the one on "stack".
assert_eq!(Arc::strong_count(&shard0), refcount_start);
}
#[tokio::test(start_paused = true)]
async fn test_reference_cycle_broken_when_cache_is_dropped() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
// helper to check if a handle is referenced by per_timeline_state
let per_timeline_state_refs_handle = |handle_weak: &Weak<Mutex<HandleInner<_>>>| {
let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap();
let per_timeline_state = per_timeline_state.as_ref().unwrap();
per_timeline_state
.values()
.any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak))
};
// Fill the cache.
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
let handle_inner_weak = Arc::downgrade(&handle.inner);
assert!(
per_timeline_state_refs_handle(&handle_inner_weak),
"we still hold `handle` _and_ haven't dropped `cache` yet"
);
// Drop the cache.
drop(cache);
assert!(
!(per_timeline_state_refs_handle(&handle_inner_weak)),
"nothing should reference the handle allocation anymore"
);
assert!(
Weak::upgrade(&handle_inner_weak).is_some(),
"the local `handle` still keeps the allocation alive"
);
// but obviously the cache is gone so no new allocations can be handed out.
// Drop handle.
drop(handle);
assert!(
Weak::upgrade(&handle_inner_weak).is_none(),
"the local `handle` is dropped, so the allocation should be dropped by now"
);
}
#[tokio::test(start_paused = true)]
async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() {
crate::tenant::harness::setup_logging();
let timeline_id = TimelineId::generate();
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::unsharded(),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let mgr = StubManager {
shards: vec![shard0.clone()],
};
let key = DBDIR_KEY;
let mut cache = Cache::<TestTypes>::default();
let handle = cache
.get(timeline_id, ShardSelector::Page(key), &mgr)
.await
.expect("we have the timeline");
// grab a weak reference to the inner so can later try to Weak::upgrade it and assert that fails
let handle_inner_weak = Arc::downgrade(&handle.inner);
// drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it
drop(handle);
assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still");
// Shutdown the per_timeline_state.
shard0.per_timeline_state.shutdown();
assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer");
// cache only contains Weak's, so, it can outlive the per_timeline_state without
// Drop explicitly solely to make this point.
drop(cache);
}
}

View File

@@ -2,11 +2,10 @@ use std::sync::Arc;
use pageserver_api::models::{TenantState, TimelineState};
use super::delete::{delete_local_timeline_directory, DeletionGuard};
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
#[derive(thiserror::Error, Debug)]
@@ -37,10 +36,13 @@ pub(crate) async fn offload_timeline(
debug_assert_current_span_has_tenant_and_timeline_id();
tracing::info!("offloading archived timeline");
let (timeline, guard) = make_timeline_delete_guard(
let allow_offloaded_children = true;
let set_stopping = false;
let (timeline, guard) = DeleteTimelineFlow::prepare(
tenant,
timeline.timeline_id,
TimelineDeleteGuardKind::Offload,
allow_offloaded_children,
set_stopping,
)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
@@ -104,7 +106,7 @@ pub(crate) async fn offload_timeline(
}
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`make_timeline_delete_guard`]
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
///
/// Returns the strong count of the timeline `Arc`
fn remove_timeline_from_tenant(

View File

@@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection(
let (replication_client, connection) = {
let mut config = wal_source_connconf.to_tokio_postgres_config();
config.application_name(format!("pageserver-{}", node.0).as_str());
config.application_name("pageserver");
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
Ok(client_and_conn) => client_and_conn?,

View File

@@ -911,85 +911,57 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (entry->access_count++ == 0)
dlist_delete(&entry->list_node);
}
/*-----------
* If the chunk wasn't already in the LFC then we have these
* options, in order of preference:
*
* Unless there is no space available, we can:
* 1. Use an entry from the `holes` list, and
* 2. Create a new entry.
* We can always, regardless of space in the LFC:
* 3. evict an entry from LRU, and
* 4. ignore the write operation (the least favorite option)
*/
else if (lfc_ctl->used < lfc_ctl->limit)
else
{
if (!dlist_is_empty(&lfc_ctl->holes))
/*
* We have two choices if all cache pages are pinned (i.e. used in IO
* operations):
*
* 1) Wait until some of this operation is completed and pages is
* unpinned.
*
* 2) Allocate one more chunk, so that specified cache size is more
* recommendation than hard limit.
*
* As far as probability of such event (that all pages are pinned) is
* considered to be very very small: there are should be very large
* number of concurrent IO operations and them are limited by
* max_connections, we prefer not to complicate code and use second
* approach.
*/
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
}
CriticalAssert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
neon_log(DEBUG2, "Swap file cache page");
}
else if (!dlist_is_empty(&lfc_ctl->holes))
{
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node,
dlist_pop_head_node(&lfc_ctl->holes));
uint32 offset = hole->offset;
bool hole_found;
hash_search_with_hash_value(lfc_hash, &hole->key,
hole->hash, HASH_REMOVE, &hole_found);
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
uint32 offset = hole->offset;
bool hole_found;
hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found);
CriticalAssert(hole_found);
lfc_ctl->used += 1;
entry->offset = offset; /* reuse the hole */
entry->offset = offset; /* reuse the hole */
}
else
{
lfc_ctl->used += 1;
entry->offset = lfc_ctl->size++;/* allocate new chunk at end
* of file */
entry->offset = lfc_ctl->size++; /* allocate new chunk at end
* of file */
}
}
/*
* We've already used up all allocated LFC entries.
*
* If we can clear an entry from the LRU, do that.
* If we can't (e.g. because all other slots are being accessed)
* then we will remove this entry from the hash and continue
* on to the next chunk, as we may not exceed the limit.
*/
else if (!dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
dlist_pop_head_node(&lfc_ctl->lru));
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
}
CriticalAssert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key,
victim->hash, HASH_REMOVE, NULL);
neon_log(DEBUG2, "Swap file cache page");
}
else
{
/* Can't add this chunk - we don't have the space for it */
hash_search_with_hash_value(lfc_hash, &entry->key, hash,
HASH_REMOVE, NULL);
/*
* We can't process this chunk due to lack of space in LFC,
* so skip to the next one
*/
LWLockRelease(lfc_lock);
blkno += blocks_in_chunk;
buf_offset += blocks_in_chunk;
nblocks -= blocks_in_chunk;
continue;
}
if (!found)
{
entry->access_count = 1;
entry->hash = hash;
memset(entry->bitmap, 0, sizeof entry->bitmap);

View File

@@ -911,74 +911,7 @@ pageserver_receive(shardno_t shard_no)
}
PG_CATCH();
{
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
pageserver_disconnect(shard_no);
PG_RE_THROW();
}
PG_END_TRY();
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = nm_to_string((NeonMessage *) resp);
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
pfree(msg);
}
}
else if (rc == -1)
{
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect(shard_no);
resp = NULL;
}
else if (rc == -2)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
}
else
{
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
}
shard->nresponses_received++;
return (NeonResponse *) resp;
}
static NeonResponse *
pageserver_try_receive(shardno_t shard_no)
{
StringInfoData resp_buff;
NeonResponse *resp;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
/* read response */
int rc;
if (shard->state != PS_Connected)
return NULL;
Assert(pageserver_conn);
rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */);
if (rc == 0)
return NULL;
else if (rc > 0)
{
PG_TRY();
{
resp_buff.len = rc;
resp_buff.cursor = 0;
resp = nm_unpack_response(&resp_buff);
PQfreemem(resp_buff.data);
}
PG_CATCH();
{
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
pageserver_disconnect(shard_no);
PG_RE_THROW();
}
@@ -1047,7 +980,6 @@ page_server_api api =
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive,
.try_receive = pageserver_try_receive,
.disconnect = pageserver_disconnect_shard
};

View File

@@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
return true;
}
/* --------------------------------
* pq_getmsgint16 - get a binary 2-byte int from a message buffer
* --------------------------------
*/
uint16
pq_getmsgint16(StringInfo msg)
{
return pq_getmsgint(msg, 2);
}
/* --------------------------------
* pq_getmsgint32 - get a binary 4-byte int from a message buffer
* --------------------------------
*/
uint32
pq_getmsgint32(StringInfo msg)
{
return pq_getmsgint(msg, 4);
}
/* --------------------------------
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
* --------------------------------

View File

@@ -8,6 +8,8 @@
#endif
bool HexDecodeString(uint8 *result, char *input, int nbytes);
uint16 pq_getmsgint16(StringInfo msg);
uint32 pq_getmsgint32(StringInfo msg);
uint32 pq_getmsgint32_le(StringInfo msg);
uint64 pq_getmsgint64_le(StringInfo msg);
void pq_sendint32_le(StringInfo buf, uint32 i);

View File

@@ -34,8 +34,6 @@ typedef enum
T_NeonGetPageRequest,
T_NeonDbSizeRequest,
T_NeonGetSlruSegmentRequest,
/* future tags above this line */
T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */
/* pagestore -> pagestore_client */
T_NeonExistsResponse = 100,
@@ -44,8 +42,6 @@ typedef enum
T_NeonErrorResponse,
T_NeonDbSizeResponse,
T_NeonGetSlruSegmentResponse,
/* future tags above this line */
T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */
} NeonMessageTag;
typedef uint64 NeonRequestId;
@@ -196,29 +192,9 @@ typedef uint16 shardno_t;
typedef struct
{
/*
* Send this request to the PageServer associated with this shard.
*/
bool (*send) (shardno_t shard_no, NeonRequest * request);
/*
* Blocking read for the next response of this shard.
*
* When a CANCEL signal is handled, the connection state will be
* unmodified.
*/
NeonResponse *(*receive) (shardno_t shard_no);
/*
* Try get the next response from the TCP buffers, if any.
* Returns NULL when the data is not yet available.
*/
NeonResponse *(*try_receive) (shardno_t shard_no);
/*
* Make sure all requests are sent to PageServer.
*/
bool (*flush) (shardno_t shard_no);
/*
* Disconnect from this pageserver shard.
*/
void (*disconnect) (shardno_t shard_no);
} page_server_api;

View File

@@ -405,56 +405,6 @@ compact_prefetch_buffers(void)
return false;
}
/*
* If there might be responses still in the TCP buffer, then
* we should try to use those, so as to reduce any TCP backpressure
* on the OS/PS side.
*
* This procedure handles that.
*
* Note that this is only valid as long as the only pipelined
* operations in the TCP buffer are getPage@Lsn requests.
*/
static void
prefetch_pump_state(void)
{
while (MyPState->ring_receive != MyPState->ring_flush)
{
NeonResponse *response;
PrefetchRequest *slot;
MemoryContext old;
slot = GetPrfSlot(MyPState->ring_receive);
old = MemoryContextSwitchTo(MyPState->errctx);
response = page_server->try_receive(slot->shard_no);
MemoryContextSwitchTo(old);
if (response == NULL)
break;
/* The slot should still be valid */
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
slot->my_ring_index != MyPState->ring_receive)
neon_shard_log(slot->shard_no, ERROR,
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
slot->status, slot->response,
(long) slot->my_ring_index, (long) MyPState->ring_receive);
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
MyPState->ring_receive += 1;
MyNeonCounters->getpage_prefetches_buffered =
MyPState->n_responses_buffered;
/* update slot state */
slot->status = PRFS_RECEIVED;
slot->response = response;
}
}
void
readahead_buffer_resize(int newsize, void *extra)
{
@@ -2858,8 +2808,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
MyPState->ring_last <= ring_index);
}
prefetch_pump_state();
return false;
}
@@ -2901,8 +2849,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Assert(ring_index < MyPState->ring_unused &&
MyPState->ring_last <= ring_index);
prefetch_pump_state();
return false;
}
#endif /* PG_MAJORVERSION_NUM < 17 */
@@ -2945,8 +2891,6 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdwriteback(reln, forknum, blocknum, nblocks);
@@ -3201,8 +3145,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
{
@@ -3340,8 +3282,6 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read);
prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
{
@@ -3510,8 +3450,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
#if PG_MAJORVERSION_NUM >= 17
@@ -3565,8 +3503,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
@@ -3856,8 +3792,6 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
prefetch_pump_state();
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdimmedsync(reln, forknum);

View File

@@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk);
static bool RecvAppendResponses(Safekeeper *sk);
static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version);
static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
@@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk);
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
static void MembershipConfigurationFree(MembershipConfiguration *mconf);
WalProposer *
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
}
wp->quorum = wp->n_safekeepers / 2 + 1;
if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
/* Fill the greeting package */
wp->greetRequest.tag = 'g';
wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
wp->greetRequest.pgVersion = PG_VERSION_NUM;
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
wp->greetRequest.systemId = wp->config->systemId;
if (!wp->config->neon_timeline)
wp_log(FATAL, "neon.timeline_id is not provided");
if (*wp->config->neon_timeline != '\0' &&
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
wp->greetRequest.pam.tag = 'g';
if (!wp->config->neon_tenant)
wp_log(FATAL, "neon.tenant_id is not provided");
if (*wp->config->neon_tenant != '\0' &&
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
wp->greetRequest.timeline = wp->config->pgTimeline;
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
wp->greetRequest.tenant_id = wp->config->neon_tenant;
if (!wp->config->neon_timeline)
wp_log(FATAL, "neon.timeline_id is not provided");
wp->greetRequest.timeline_id = wp->config->neon_timeline;
wp->greetRequest.pg_version = PG_VERSION_NUM;
wp->greetRequest.system_id = wp->config->systemId;
wp->greetRequest.wal_seg_size = wp->config->wal_segment_size;
wp->api.init_event_set(wp);
@@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
void
WalProposerFree(WalProposer *wp)
{
MembershipConfigurationFree(&wp->mconf);
for (int i = 0; i < wp->n_safekeepers; i++)
{
Safekeeper *sk = &wp->safekeeper[i];
Assert(sk->outbuf.data != NULL);
pfree(sk->outbuf.data);
MembershipConfigurationFree(&sk->greetResponse.mconf);
if (sk->voteResponse.termHistory.entries)
pfree(sk->voteResponse.termHistory.entries);
sk->voteResponse.termHistory.entries = NULL;
@@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk)
sk->state = SS_OFFLINE;
sk->streamingAt = InvalidXLogRecPtr;
MembershipConfigurationFree(&sk->greetResponse.mconf);
if (sk->voteResponse.termHistory.entries)
pfree(sk->voteResponse.termHistory.entries);
sk->voteResponse.termHistory.entries = NULL;
@@ -598,11 +600,14 @@ static void
SendStartWALPush(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
#define CMD_LEN 512
char cmd[CMD_LEN];
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
if (!wp->api.conn_send_query(sk, cmd))
{
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
wp_log(WARNING, "failed to send %s query to safekeeper %s:%s: %s",
cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
}
@@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk)
/*
* Start handshake: first of all send information about the
* safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
* walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
* a response to finish the handshake.
*/
static void
SendProposerGreeting(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);
wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
pfree(mconf_toml);
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
&sk->outbuf, wp->config->proto_version);
/*
* On failure, logging & resetting the connection is handled. We just need
* to handle the control flow.
*/
BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV);
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
}
static void
RecvAcceptorGreeting(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
char *mconf_toml;
/*
* If our reading doesn't immediately succeed, any necessary error
@@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
return;
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT,
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
pfree(mconf_toml);
/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
@@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
wp->propTerm++;
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
wp->voteRequest = (VoteRequest)
{
.tag = 'v',
.term = wp->propTerm
};
memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN);
wp->voteRequest.pam.tag = 'v';
wp->voteRequest.generation = wp->mconf.generation;
wp->voteRequest.term = wp->propTerm;
}
}
else if (sk->greetResponse.term > wp->propTerm)
@@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
/* We have quorum for voting, send our vote request */
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
/* On failure, logging & resetting is handled */
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
return;
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
&sk->outbuf, wp->config->proto_version);
/* We have quorum for voting, send our vote request */
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
wp->voteRequest.generation, wp->voteRequest.term);
/* On failure, logging & resetting is handled */
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
/* If successful, wait for read-ready with SS_WAIT_VERDICT */
}
@@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk)
return;
wp_log(LOG,
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
sk->voteResponse.voteGiven,
GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));
/*
* In case of acceptor rejecting our vote, bail out, but only if either it
@@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp)
* otherwise we must be sync-safekeepers and we have nothing to do then.
*
* Proceeding is not only pointless but harmful, because we'd give
* safekeepers term history starting with 0/0. These hacks will go away once
* we disable implicit timeline creation on safekeepers and create it with
* non zero LSN from the start.
* safekeepers term history starting with 0/0. These hacks will go away
* once we disable implicit timeline creation on safekeepers and create it
* with non zero LSN from the start.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr)
{
@@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
wp->truncateLsn = InvalidXLogRecPtr;
wp->timelineStartLsn = InvalidXLogRecPtr;
for (int i = 0; i < wp->n_safekeepers; i++)
{
@@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->donor = i;
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
{
/* timelineStartLsn should be the same everywhere or unknown */
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
{
wp_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
}
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
}
}
}
@@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp)
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
if (wp->timelineStartLsn == InvalidXLogRecPtr)
{
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
}
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
}
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
/*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
* it should never be zero at this point, if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);
/*
* We will be generating WAL since propEpochStartLsn, so we should set
@@ -1052,10 +1044,11 @@ DetermineEpochStartLsn(WalProposer *wp)
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
{
/*
* However, allow to proceed if last_log_term on the node which gave
* the highest vote (i.e. point where we are going to start writing)
* actually had been won by me; plain restart of walproposer not
* intervened by concurrent compute which wrote WAL is ok.
* However, allow to proceed if last_log_term on the node which
* gave the highest vote (i.e. point where we are going to start
* writing) actually had been won by me; plain restart of
* walproposer not intervened by concurrent compute which wrote
* WAL is ok.
*
* This avoids compute crash after manual term_bump.
*/
@@ -1125,14 +1118,8 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/*
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
* is created manually (test_s3_wal_replay)
*/
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
}
else
{
@@ -1157,29 +1144,19 @@ SendProposerElected(Safekeeper *sk)
Assert(sk->startStreamingAt <= wp->availableLsn);
msg.tag = 'e';
msg.apm.tag = 'e';
msg.generation = wp->mconf.generation;
msg.term = wp->propTerm;
msg.startStreamingAt = sk->startStreamingAt;
msg.termHistory = &wp->propTermHistory;
msg.timelineStartLsn = wp->timelineStartLsn;
lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
wp_log(LOG,
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
resetStringInfo(&sk->outbuf);
pq_sendint64_le(&sk->outbuf, msg.tag);
pq_sendint64_le(&sk->outbuf, msg.term);
pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
for (int i = 0; i < msg.termHistory->n_entries; i++)
{
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
}
pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);
"sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s",
sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt),
lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);
if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
return;
@@ -1245,14 +1222,13 @@ static void
PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
{
Assert(endLsn >= beginLsn);
req->tag = 'a';
req->apm.tag = 'a';
req->generation = wp->mconf.generation;
req->term = wp->propTerm;
req->epochStartLsn = wp->propEpochStartLsn;
req->beginLsn = beginLsn;
req->endLsn = endLsn;
req->commitLsn = wp->commitLsn;
req->truncateLsn = wp->truncateLsn;
req->proposerId = wp->greetRequest.proposerId;
}
/*
@@ -1353,7 +1329,8 @@ SendAppendRequests(Safekeeper *sk)
resetStringInfo(&sk->outbuf);
/* write AppendRequest header */
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version);
/* prepare for reading WAL into the outbuf */
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
sk->active_state = SS_ACTIVE_READ_WAL;
}
@@ -1366,14 +1343,17 @@ SendAppendRequests(Safekeeper *sk)
req = &sk->appendRequest;
req_len = req->endLsn - req->beginLsn;
/* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */
/*
* We send zero sized AppenRequests as heartbeats; don't wal_read
* for these.
*/
if (req_len > 0)
{
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
{
case NEON_WALREAD_SUCCESS:
break;
@@ -1381,7 +1361,7 @@ SendAppendRequests(Safekeeper *sk)
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
@@ -1469,11 +1449,11 @@ RecvAppendResponses(Safekeeper *sk)
* Term has changed to higher one, probably another compute is
* running. If this is the case we could PANIC as well because
* likely it inserted some data and our basebackup is unsuitable
* anymore. However, we also bump term manually (term_bump endpoint)
* on safekeepers for migration purposes, in this case we do want
* compute to stay alive. So restart walproposer with FATAL instead
* of panicking; if basebackup is spoiled next election will notice
* this.
* anymore. However, we also bump term manually (term_bump
* endpoint) on safekeepers for migration purposes, in this case
* we do want compute to stay alive. So restart walproposer with
* FATAL instead of panicking; if basebackup is spoiled next
* election will notice this.
*/
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
sk->host, sk->port,
@@ -1749,6 +1729,208 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
}
}
/* Serialize MembershipConfiguration into buf. */
static void
MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf)
{
uint32 i;
pq_sendint32(buf, mconf->generation);
pq_sendint32(buf, mconf->members.len);
for (i = 0; i < mconf->members.len; i++)
{
pq_sendint64(buf, mconf->members.m[i].node_id);
pq_send_ascii_string(buf, mconf->members.m[i].host);
pq_sendint16(buf, mconf->members.m[i].port);
}
/*
* There is no special mark for absent new_members; zero members in
* invalid, so zero len means absent.
*/
pq_sendint32(buf, mconf->new_members.len);
for (i = 0; i < mconf->new_members.len; i++)
{
pq_sendint64(buf, mconf->new_members.m[i].node_id);
pq_send_ascii_string(buf, mconf->new_members.m[i].host);
pq_sendint16(buf, mconf->new_members.m[i].port);
}
}
/* Serialize proposer -> acceptor message into buf using specified version */
static void
PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version)
{
/* both version are supported currently until we fully migrate to 3 */
Assert(proto_version == 3 || proto_version == 2);
resetStringInfo(buf);
if (proto_version == 3)
{
/*
* v2 sends structs for some messages as is, so commonly send tag only
* for v3
*/
pq_sendint8(buf, msg->tag);
switch (msg->tag)
{
case 'g':
{
ProposerGreeting *m = (ProposerGreeting *) msg;
pq_send_ascii_string(buf, m->tenant_id);
pq_send_ascii_string(buf, m->timeline_id);
MembershipConfigurationSerialize(&m->mconf, buf);
pq_sendint32(buf, m->pg_version);
pq_sendint64(buf, m->system_id);
pq_sendint32(buf, m->wal_seg_size);
break;
}
case 'v':
{
VoteRequest *m = (VoteRequest *) msg;
pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
break;
}
case 'e':
{
ProposerElected *m = (ProposerElected *) msg;
pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
pq_sendint64(buf, m->startStreamingAt);
pq_sendint32(buf, m->termHistory->n_entries);
for (uint32 i = 0; i < m->termHistory->n_entries; i++)
{
pq_sendint64(buf, m->termHistory->entries[i].term);
pq_sendint64(buf, m->termHistory->entries[i].lsn);
}
break;
}
case 'a':
{
/*
* Note: this serializes only AppendRequestHeader, caller
* is expected to append WAL data later.
*/
AppendRequestHeader *m = (AppendRequestHeader *) msg;
pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
pq_sendint64(buf, m->beginLsn);
pq_sendint64(buf, m->endLsn);
pq_sendint64(buf, m->commitLsn);
pq_sendint64(buf, m->truncateLsn);
break;
}
default:
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
}
return;
}
if (proto_version == 2)
{
switch (msg->tag)
{
case 'g':
{
/* v2 sent struct as is */
ProposerGreeting *m = (ProposerGreeting *) msg;
ProposerGreetingV2 greetRequestV2;
/* Fill also v2 struct. */
greetRequestV2.tag = 'g';
greetRequestV2.protocolVersion = proto_version;
greetRequestV2.pgVersion = m->pg_version;
/*
* v3 removed this field because it's easier to pass as
* libq or START_WAL_PUSH options
*/
memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId));
greetRequestV2.systemId = wp->config->systemId;
if (*m->timeline_id != '\0' &&
!HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id);
if (*m->tenant_id != '\0' &&
!HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id);
greetRequestV2.timeline = wp->config->pgTimeline;
greetRequestV2.walSegSize = wp->config->wal_segment_size;
pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2));
break;
}
case 'v':
{
/* v2 sent struct as is */
VoteRequest *m = (VoteRequest *) msg;
VoteRequestV2 voteRequestV2;
voteRequestV2.tag = m->pam.tag;
voteRequestV2.term = m->term;
/* removed field */
memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId));
pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2));
break;
}
case 'e':
{
ProposerElected *m = (ProposerElected *) msg;
pq_sendint64_le(buf, m->apm.tag);
pq_sendint64_le(buf, m->term);
pq_sendint64_le(buf, m->startStreamingAt);
pq_sendint32_le(buf, m->termHistory->n_entries);
for (int i = 0; i < m->termHistory->n_entries; i++)
{
pq_sendint64_le(buf, m->termHistory->entries[i].term);
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
}
pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */
break;
}
case 'a':
/*
* Note: this serializes only AppendRequestHeader, caller is
* expected to append WAL data later.
*/
{
/* v2 sent struct as is */
AppendRequestHeader *m = (AppendRequestHeader *) msg;
AppendRequestHeaderV2 appendRequestHeaderV2;
appendRequestHeaderV2.tag = m->apm.tag;
appendRequestHeaderV2.term = m->term;
appendRequestHeaderV2.epochStartLsn = 0; /* removed field */
appendRequestHeaderV2.beginLsn = m->beginLsn;
appendRequestHeaderV2.endLsn = m->endLsn;
appendRequestHeaderV2.commitLsn = m->commitLsn;
appendRequestHeaderV2.truncateLsn = m->truncateLsn;
/* removed field */
memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId));
pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2));
break;
}
default:
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
}
return;
}
wp_log(FATAL, "unexpected proto_version %d", proto_version);
}
/*
* Try to read CopyData message from i'th safekeeper, resetting connection on
* failure.
@@ -1778,6 +1960,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
return false;
}
/* Deserialize membership configuration from buf to mconf. */
static void
MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
{
uint32 i;
mconf->generation = pq_getmsgint32(buf);
mconf->members.len = pq_getmsgint32(buf);
mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
for (i = 0; i < mconf->members.len; i++)
{
const char *buf_host;
mconf->members.m[i].node_id = pq_getmsgint64(buf);
buf_host = pq_getmsgrawstring(buf);
strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host));
mconf->members.m[i].port = pq_getmsgint16(buf);
}
mconf->new_members.len = pq_getmsgint32(buf);
mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
for (i = 0; i < mconf->new_members.len; i++)
{
const char *buf_host;
mconf->new_members.m[i].node_id = pq_getmsgint64(buf);
buf_host = pq_getmsgrawstring(buf);
strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host));
mconf->new_members.m[i].port = pq_getmsgint16(buf);
}
}
/*
* Read next message with known type into provided struct, by reading a CopyData
* block from the safekeeper's postgres connection, returning whether the read
@@ -1786,6 +1999,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
* If the read needs more polling, we return 'false' and keep the state
* unmodified, waiting until it becomes read-ready to try again. If it fully
* failed, a warning is emitted and the connection is reset.
*
* Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
*/
static bool
AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
@@ -1794,82 +2009,153 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
char *buf;
int buf_size;
uint64 tag;
uint8 tag;
StringInfoData s;
if (!(AsyncRead(sk, &buf, &buf_size)))
return false;
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
/* parse it */
s.data = buf;
s.len = buf_size;
s.maxlen = buf_size;
s.cursor = 0;
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
if (wp->config->proto_version == 3)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
switch (tag)
{
case 'g':
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->nodeId = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->voteGiven = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->truncateLsn = pq_getmsgint64_le(&s);
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (int i = 0; i < msg->termHistory.n_entries; i++)
tag = pq_getmsgbyte(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
switch (tag)
{
case 'g':
{
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
msg->nodeId = pq_getmsgint64(&s);
MembershipConfigurationDeserialize(&msg->mconf, &s);
msg->term = pq_getmsgint64(&s);
pq_getmsgend(&s);
return true;
}
msg->timelineStartLsn = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;
msg->generation = pq_getmsgint32(&s);
msg->term = pq_getmsgint64(&s);
msg->voteGiven = pq_getmsgbyte(&s);
msg->flushLsn = pq_getmsgint64(&s);
msg->truncateLsn = pq_getmsgint64(&s);
msg->termHistory.n_entries = pq_getmsgint32(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
{
msg->termHistory.entries[i].term = pq_getmsgint64(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
}
pq_getmsgend(&s);
return true;
}
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->commitLsn = pq_getmsgint64_le(&s);
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
default:
{
Assert(false);
return false;
}
msg->generation = pq_getmsgint32(&s);
msg->term = pq_getmsgint64(&s);
msg->flushLsn = pq_getmsgint64(&s);
msg->commitLsn = pq_getmsgint64(&s);
msg->hs.ts = pq_getmsgint64(&s);
msg->hs.xmin.value = pq_getmsgint64(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
default:
{
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
return false;
}
}
}
else if (wp->config->proto_version == 2)
{
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
switch (tag)
{
case 'g':
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->nodeId = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->voteGiven = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->truncateLsn = pq_getmsgint64_le(&s);
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (int i = 0; i < msg->termHistory.n_entries; i++)
{
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
}
pq_getmsgint64_le(&s); /* timelineStartLsn */
pq_getmsgend(&s);
return true;
}
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->commitLsn = pq_getmsgint64_le(&s);
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
default:
{
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
return false;
}
}
}
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
}
/*
@@ -2245,3 +2531,45 @@ FormatEvents(WalProposer *wp, uint32 events)
return (char *) &return_str;
}
/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */
static char *
MembershipConfigurationToString(MembershipConfiguration *mconf)
{
StringInfoData s;
uint32 i;
initStringInfo(&s);
appendStringInfo(&s, "{gen = %u", mconf->generation);
appendStringInfoString(&s, ", members = [");
for (i = 0; i < mconf->members.len; i++)
{
if (i > 0)
appendStringInfoString(&s, ", ");
appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id);
appendStringInfo(&s, ", host = %s", mconf->members.m[i].host);
appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port);
}
appendStringInfo(&s, "], new_members = [");
for (i = 0; i < mconf->new_members.len; i++)
{
if (i > 0)
appendStringInfoString(&s, ", ");
appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id);
appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host);
appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port);
}
appendStringInfoString(&s, "]}");
return s.data;
}
static void
MembershipConfigurationFree(MembershipConfiguration *mconf)
{
if (mconf->members.m)
pfree(mconf->members.m);
mconf->members.m = NULL;
if (mconf->new_members.m)
pfree(mconf->new_members.m);
mconf->new_members.m = NULL;
}

View File

@@ -12,9 +12,6 @@
#include "neon_walreader.h"
#include "pagestore_client.h"
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
#define MAX_SAFEKEEPERS 32
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL
* message */
@@ -143,12 +140,71 @@ typedef uint64 term_t;
/* neon storage node id */
typedef uint64 NNodeId;
/*
* Number uniquely identifying safekeeper membership configuration.
* This and following structs pair ones in membership.rs.
*/
typedef uint32 Generation;
typedef struct SafekeeperId
{
NNodeId node_id;
char host[MAXCONNINFO];
uint16 port;
} SafekeeperId;
/* Set of safekeepers. */
typedef struct MemberSet
{
uint32 len; /* number of members */
SafekeeperId *m; /* ids themselves */
} MemberSet;
/* Timeline safekeeper membership configuration. */
typedef struct MembershipConfiguration
{
Generation generation;
MemberSet members;
/* Has 0 n_members in non joint conf. */
MemberSet new_members;
} MembershipConfiguration;
/*
* Proposer <-> Acceptor messaging.
*/
typedef struct ProposerAcceptorMessage
{
uint8 tag;
} ProposerAcceptorMessage;
/* Initial Proposer -> Acceptor message */
typedef struct ProposerGreeting
{
ProposerAcceptorMessage pam; /* message tag */
/*
* tenant/timeline ids as C strings with standard hex notation for ease of
* printing. In principle they are not strictly needed as ttid is also
* passed as libpq options.
*/
char *tenant_id;
char *timeline_id;
/* Full conf is carried to allow safekeeper switch */
MembershipConfiguration mconf;
/*
* pg_version and wal_seg_size are used for timeline creation until we
* fully migrate to doing externally. systemId is only used as a sanity
* cross check.
*/
uint32 pg_version; /* in PG_VERSION_NUM format */
uint64 system_id; /* Postgres system identifier. */
uint32 wal_seg_size;
} ProposerGreeting;
/* protocol v2 variant, kept while wp supports it */
typedef struct ProposerGreetingV2
{
uint64 tag; /* message tag */
uint32 protocolVersion; /* proposer-safekeeper protocol version */
@@ -159,32 +215,42 @@ typedef struct ProposerGreeting
uint8 tenant_id[16];
TimeLineID timeline;
uint32 walSegSize;
} ProposerGreeting;
} ProposerGreetingV2;
typedef struct AcceptorProposerMessage
{
uint64 tag;
uint8 tag;
} AcceptorProposerMessage;
/*
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
* Acceptor -> Proposer initial response: the highest term acceptor voted for,
* its node id and configuration.
*/
typedef struct AcceptorGreeting
{
AcceptorProposerMessage apm;
term_t term;
NNodeId nodeId;
MembershipConfiguration mconf;
term_t term;
} AcceptorGreeting;
/*
* Proposer -> Acceptor vote request.
*/
typedef struct VoteRequest
{
ProposerAcceptorMessage pam; /* message tag */
Generation generation; /* membership conf generation */
term_t term;
} VoteRequest;
/* protocol v2 variant, kept while wp supports it */
typedef struct VoteRequestV2
{
uint64 tag;
term_t term;
pg_uuid_t proposerId; /* for monitoring/debugging */
} VoteRequest;
} VoteRequestV2;
/* Element of term switching chain. */
typedef struct TermSwitchEntry
@@ -203,8 +269,15 @@ typedef struct TermHistory
typedef struct VoteResponse
{
AcceptorProposerMessage apm;
/*
* Membership conf generation. It's not strictly required because on
* mismatch safekeeper is expected to ERROR the connection, but let's
* sanity check it.
*/
Generation generation;
term_t term;
uint64 voteGiven;
uint8 voteGiven;
/*
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
@@ -214,7 +287,6 @@ typedef struct VoteResponse
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for*
* recovery of some safekeeper */
TermHistory termHistory;
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
} VoteResponse;
/*
@@ -223,20 +295,37 @@ typedef struct VoteResponse
*/
typedef struct ProposerElected
{
uint64 tag;
AcceptorProposerMessage apm;
Generation generation; /* membership conf generation */
term_t term;
/* proposer will send since this point */
XLogRecPtr startStreamingAt;
/* history of term switches up to this proposer */
TermHistory *termHistory;
/* timeline globally starts at this LSN */
XLogRecPtr timelineStartLsn;
} ProposerElected;
/*
* Header of request with WAL message sent from proposer to safekeeper.
*/
typedef struct AppendRequestHeader
{
AcceptorProposerMessage apm;
Generation generation; /* membership conf generation */
term_t term; /* term of the proposer */
XLogRecPtr beginLsn; /* start position of message in WAL */
XLogRecPtr endLsn; /* end position of message in WAL */
XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */
/*
* minimal LSN which may be needed for recovery of some safekeeper (end
* lsn + 1 of last chunk streamed to everyone)
*/
XLogRecPtr truncateLsn;
/* in the AppendRequest message, WAL data follows */
} AppendRequestHeader;
/* protocol v2 variant, kept while wp supports it */
typedef struct AppendRequestHeaderV2
{
uint64 tag;
term_t term; /* term of the proposer */
@@ -256,7 +345,8 @@ typedef struct AppendRequestHeader
*/
XLogRecPtr truncateLsn;
pg_uuid_t proposerId; /* for monitoring/debugging */
} AppendRequestHeader;
/* in the AppendRequest message, WAL data follows */
} AppendRequestHeaderV2;
/*
* Hot standby feedback received from replica
@@ -309,6 +399,13 @@ typedef struct AppendResponse
{
AcceptorProposerMessage apm;
/*
* Membership conf generation. It's not strictly required because on
* mismatch safekeeper is expected to ERROR the connection, but let's
* sanity check it.
*/
Generation generation;
/*
* Current term of the safekeeper; if it is higher than proposer's, the
* compute is out of date.
@@ -644,6 +741,8 @@ typedef struct WalProposerConfig
/* Will be passed to safekeepers in greet request. */
TimeLineID pgTimeline;
int proto_version;
#ifdef WALPROPOSER_LIB
void *callback_data;
#endif
@@ -656,11 +755,14 @@ typedef struct WalProposerConfig
typedef struct WalProposer
{
WalProposerConfig *config;
int n_safekeepers;
/* Current walproposer membership configuration */
MembershipConfiguration mconf;
/* (n_safekeepers / 2) + 1 */
int quorum;
/* Number of occupied slots in safekeepers[] */
int n_safekeepers;
Safekeeper safekeeper[MAX_SAFEKEEPERS];
/* WAL has been generated up to this point */
@@ -670,6 +772,7 @@ typedef struct WalProposer
XLogRecPtr commitLsn;
ProposerGreeting greetRequest;
ProposerGreetingV2 greetRequestV2;
/* Vote request for safekeeper */
VoteRequest voteRequest;

View File

@@ -155,6 +155,16 @@ pq_getmsgend(StringInfo msg)
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
}
/* --------------------------------
* pq_sendbytes - append raw data to a StringInfo buffer
* --------------------------------
*/
void
pq_sendbytes(StringInfo buf, const void *data, int datalen)
{
/* use variant that maintains a trailing null-byte, out of caution */
appendBinaryStringInfo(buf, data, datalen);
}
/*
* Produce a C-string representation of a TimestampTz.

View File

@@ -59,9 +59,11 @@
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
/* GUCs */
char *wal_acceptors_list = "";
int wal_acceptor_reconnect_timeout = 1000;
int wal_acceptor_connection_timeout = 10000;
int safekeeper_proto_version = 2;
/* Set to true in the walproposer bgw. */
static bool am_walproposer;
@@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers)
else
walprop_config.systemId = 0;
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
walprop_config.proto_version = safekeeper_proto_version;
}
/*
@@ -219,25 +222,37 @@ nwp_register_gucs(void)
PGC_SIGHUP,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable(
"neon.safekeeper_proto_version",
"Version of compute <-> safekeeper protocol.",
"Used while migrating from 2 to 3.",
&safekeeper_proto_version,
2, 0, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
}
static int
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
{
int n_safekeepers = 0;
char *curr_sk = safekeepers_list;
int n_safekeepers = 0;
char *curr_sk = safekeepers_list;
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
{
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
if (++n_safekeepers >= MAX_SAFEKEEPERS)
{
wpg_log(FATAL, "too many safekeepers");
}
coma = strchr(coma, ',');
safekeepers[n_safekeepers-1] = curr_sk;
safekeepers[n_safekeepers - 1] = curr_sk;
if (coma != NULL) {
if (coma != NULL)
{
*coma++ = '\0';
}
}
@@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
static bool
safekeepers_cmp(char *old, char *new)
{
char *safekeepers_old[MAX_SAFEKEEPERS];
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
char *safekeepers_old[MAX_SAFEKEEPERS];
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
len_old = split_safekeepers_list(old, safekeepers_old);
len_new = split_safekeepers_list(new, safekeepers_new);
@@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra)
if (!am_walproposer)
return;
if (!newval) {
if (!newval)
{
/* should never happen */
wpg_log(FATAL, "neon.safekeepers is empty");
}
@@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra)
newval_copy = pstrdup(newval);
oldval = pstrdup(wal_acceptors_list);
/*
/*
* TODO: restarting through FATAL is stupid and introduces 1s delay before
* next bgw start. We should refactor walproposer to allow graceful exit and
* thus remove this delay.
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
* next bgw start. We should refactor walproposer to allow graceful exit
* and thus remove this delay. XXX: If you change anything here, sync with
* test_safekeepers_reconfigure_reorder.
*/
if (!safekeepers_cmp(oldval, newval_copy))
{
@@ -454,7 +470,8 @@ backpressure_throttling_impl(void)
memcpy(new_status, old_status, len);
snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
set_ps_display(new_status);
new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
new_status[len] = '\0'; /* truncate off " backpressure ..." to later
* reset the ps */
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
start = GetCurrentTimestamp();
@@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
cmd.timeline = wp->greetRequest.timeline;
cmd.timeline = wp->config->pgTimeline;
cmd.startpoint = startpos;
StartProposerReplication(wp, &cmd);
}
@@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
FullTransactionId xmin = hsFeedback.xmin;
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
FullTransactionId next_xid = ReadNextFullTransactionId();
/*
* Page server is updating nextXid in checkpoint each 1024 transactions,
* so feedback xmin can be actually larger then nextXid and
* function TransactionIdInRecentPast return false in this case,
* Page server is updating nextXid in checkpoint each 1024
* transactions, so feedback xmin can be actually larger then nextXid
* and function TransactionIdInRecentPast return false in this case,
* preventing update of slot's xmin.
*/
if (FullTransactionIdPrecedes(next_xid, xmin))

View File

@@ -26,7 +26,6 @@ hex.workspace = true
humantime.workspace = true
http.workspace = true
hyper0.workspace = true
itertools.workspace = true
futures.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
@@ -40,7 +39,6 @@ scopeguard.workspace = true
reqwest = { workspace = true, features = ["json"] }
serde.workspace = true
serde_json.workspace = true
smallvec.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
@@ -65,7 +63,6 @@ storage_broker.workspace = true
tokio-stream.workspace = true
utils.workspace = true
wal_decoder.workspace = true
env_logger.workspace = true
workspace_hack.workspace = true

View File

@@ -102,11 +102,6 @@ impl Client {
self.get(&uri).await
}
pub async fn utilization(&self) -> Result<reqwest::Response> {
let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
self.get(&uri).await
}
async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
self.request(Method::GET, uri, ()).await
}

View File

@@ -207,13 +207,6 @@ struct Args {
/// Also defines interval for eviction retries.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
eviction_min_resident: Duration,
/// Enable fanning out WAL to different shards from the same reader
#[arg(long)]
wal_reader_fanout: bool,
/// Only fan out the WAL reader if the absoulte delta between the new requested position
/// and the current position of the reader is smaller than this value.
#[arg(long)]
max_delta_for_fanout: Option<u64>,
}
// Like PathBufValueParser, but allows empty string.
@@ -377,8 +370,6 @@ async fn main() -> anyhow::Result<()> {
control_file_save_interval: args.control_file_save_interval,
partial_backup_concurrency: args.partial_backup_concurrency,
eviction_min_resident: args.eviction_min_resident,
wal_reader_fanout: args.wal_reader_fanout,
max_delta_for_fanout: args.max_delta_for_fanout,
});
// initialize sentry if SENTRY_DSN is provided

View File

@@ -127,13 +127,6 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
json_response(StatusCode::OK, ())
}
async fn utilization_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let global_timelines = get_global_timelines(&request);
let utilization = global_timelines.get_timeline_counts();
json_response(StatusCode::OK, utilization)
}
/// List all (not deleted) timelines.
/// Note: it is possible to do the same with debug_dump.
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -202,7 +195,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
peer_horizon_lsn: inmem.peer_horizon_lsn,
remote_consistent_lsn: inmem.remote_consistent_lsn,
peers: tli.get_peers(conf).await,
walsenders: tli.get_walsenders().get_all_public(),
walsenders: tli.get_walsenders().get_all(),
walreceivers: tli.get_walreceivers().get_all(),
};
json_response(StatusCode::OK, status)
@@ -627,7 +620,6 @@ pub fn make_router(
failpoints_handler(r, cancel).await
})
})
.get("/v1/uzilization", |r| request_span(r, utilization_handler))
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_handler)
})

View File

@@ -8,7 +8,7 @@
use anyhow::Context;
use postgres_backend::QueryError;
use safekeeper_api::membership::Configuration;
use safekeeper_api::membership::{Configuration, INVALID_GENERATION};
use safekeeper_api::{ServerInfo, Term};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
@@ -133,10 +133,10 @@ async fn send_proposer_elected(
let history = TermHistory(history_entries);
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
generation: INVALID_GENERATION,
term,
start_streaming_at: lsn,
term_history: history,
timeline_start_lsn: lsn,
});
tli.process_msg(&proposer_elected_request).await?;
@@ -170,13 +170,12 @@ pub async fn append_logical_message(
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
generation: INVALID_GENERATION,
term: msg.term,
term_start_lsn: begin_lsn,
begin_lsn,
end_lsn,
commit_lsn,
truncate_lsn: msg.truncate_lsn,
proposer_uuid: [0u8; 16],
},
wal_data,
});

View File

@@ -108,8 +108,6 @@ pub struct SafeKeeperConf {
pub control_file_save_interval: Duration,
pub partial_backup_concurrency: usize,
pub eviction_min_resident: Duration,
pub wal_reader_fanout: bool,
pub max_delta_for_fanout: Option<u64>,
}
impl SafeKeeperConf {
@@ -152,8 +150,6 @@ impl SafeKeeperConf {
control_file_save_interval: Duration::from_secs(1),
partial_backup_concurrency: 1,
eviction_min_resident: Duration::ZERO,
wal_reader_fanout: false,
max_delta_for_fanout: None,
}
}
}

View File

@@ -12,9 +12,9 @@ use metrics::{
pow2_buckets,
proto::MetricFamily,
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec,
Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
};
use once_cell::sync::Lazy;
use postgres_ffi::XLogSegNo;
@@ -211,14 +211,6 @@ pub static WAL_RECEIVERS: Lazy<IntGauge> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_wal_receivers")
});
pub static WAL_READERS: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"safekeeper_wal_readers",
"Number of active WAL readers (may serve pageservers or other safekeepers)",
&["kind", "target"]
)
.expect("Failed to register safekeeper_wal_receivers")
});
pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
// Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and
// full queues respectively.
@@ -451,7 +443,6 @@ pub struct FullTimelineInfo {
pub timeline_is_active: bool,
pub num_computes: u32,
pub last_removed_segno: XLogSegNo,
pub interpreted_wal_reader_tasks: usize,
pub epoch_start_lsn: Lsn,
pub mem_state: TimelineMemState,
@@ -481,7 +472,6 @@ pub struct TimelineCollector {
disk_usage: GenericGaugeVec<AtomicU64>,
acceptor_term: GenericGaugeVec<AtomicU64>,
written_wal_bytes: GenericGaugeVec<AtomicU64>,
interpreted_wal_reader_tasks: GenericGaugeVec<AtomicU64>,
written_wal_seconds: GaugeVec,
flushed_wal_seconds: GaugeVec,
collect_timeline_metrics: Gauge,
@@ -680,16 +670,6 @@ impl TimelineCollector {
.unwrap();
descs.extend(active_timelines_count.desc().into_iter().cloned());
let interpreted_wal_reader_tasks = GenericGaugeVec::new(
Opts::new(
"safekeeper_interpreted_wal_reader_tasks",
"Number of active interpreted wal reader tasks, grouped by timeline",
),
&["tenant_id", "timeline_id"],
)
.unwrap();
descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned());
TimelineCollector {
global_timelines,
descs,
@@ -713,7 +693,6 @@ impl TimelineCollector {
collect_timeline_metrics,
timelines_count,
active_timelines_count,
interpreted_wal_reader_tasks,
}
}
}
@@ -742,7 +721,6 @@ impl Collector for TimelineCollector {
self.disk_usage.reset();
self.acceptor_term.reset();
self.written_wal_bytes.reset();
self.interpreted_wal_reader_tasks.reset();
self.written_wal_seconds.reset();
self.flushed_wal_seconds.reset();
@@ -804,9 +782,6 @@ impl Collector for TimelineCollector {
self.written_wal_bytes
.with_label_values(labels)
.set(tli.wal_storage.write_wal_bytes);
self.interpreted_wal_reader_tasks
.with_label_values(labels)
.set(tli.interpreted_wal_reader_tasks as u64);
self.written_wal_seconds
.with_label_values(labels)
.set(tli.wal_storage.write_wal_seconds);
@@ -859,7 +834,6 @@ impl Collector for TimelineCollector {
mfs.extend(self.disk_usage.collect());
mfs.extend(self.acceptor_term.collect());
mfs.extend(self.written_wal_bytes.collect());
mfs.extend(self.interpreted_wal_reader_tasks.collect());
mfs.extend(self.written_wal_seconds.collect());
mfs.extend(self.flushed_wal_seconds.collect());

View File

@@ -281,7 +281,7 @@ impl SafekeeperPostgresHandler {
tokio::select! {
// todo: add read|write .context to these errors
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r,
_ = timeline_cancel.cancelled() => {
return Err(CopyStreamHandlerEnd::Cancelled);
}
@@ -342,8 +342,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
let tli = match next_msg {
ProposerAcceptorMessage::Greeting(ref greeting) => {
info!(
"start handshake with walproposer {} sysid {} timeline {}",
self.peer_addr, greeting.system_id, greeting.tli,
"start handshake with walproposer {} sysid {}",
self.peer_addr, greeting.system_id,
);
let server_info = ServerInfo {
pg_version: greeting.pg_version,
@@ -459,6 +459,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
pgb_writer: &mut PostgresBackend<IO>,
mut reply_rx: Receiver<AcceptorProposerMessage>,
mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
proto_version: u32,
) -> Result<(), CopyStreamHandlerEnd> {
let mut buf = BytesMut::with_capacity(128);
@@ -496,7 +497,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
};
buf.clear();
msg.serialize(&mut buf)?;
msg.serialize(&mut buf, proto_version)?;
pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
}
}

View File

@@ -7,6 +7,7 @@ use std::{fmt, pin::pin};
use anyhow::{bail, Context};
use futures::StreamExt;
use postgres_protocol::message::backend::ReplicationMessage;
use safekeeper_api::membership::INVALID_GENERATION;
use safekeeper_api::models::{PeerInfo, TimelineStatus};
use safekeeper_api::Term;
use tokio::sync::mpsc::{channel, Receiver, Sender};
@@ -267,7 +268,10 @@ async fn recover(
);
// Now understand our term history.
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
generation: INVALID_GENERATION,
term: donor.term,
});
let vote_response = match tli
.process_msg(&vote_request)
.await
@@ -302,10 +306,10 @@ async fn recover(
// truncate WAL locally
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
generation: INVALID_GENERATION,
term: donor.term,
start_streaming_at: last_common_point.lsn,
term_history: donor_th,
timeline_start_lsn: Lsn::INVALID,
});
// Successful ProposerElected handling always returns None. If term changed,
// we'll find out that during the streaming. Note: it is expected to get
@@ -434,13 +438,12 @@ async fn network_io(
match msg {
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
generation: INVALID_GENERATION,
term: donor.term,
term_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
truncate_lsn: Lsn::INVALID, // do not attempt to advance
proposer_uuid: [0; 16],
};
let ar = AppendRequest {
h: ar_hdr,

View File

@@ -5,6 +5,11 @@ use byteorder::{LittleEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
use safekeeper_api::membership;
use safekeeper_api::membership::Generation;
use safekeeper_api::membership::MemberSet;
use safekeeper_api::membership::SafekeeperId;
use safekeeper_api::membership::INVALID_GENERATION;
use safekeeper_api::models::HotStandbyFeedback;
use safekeeper_api::Term;
use serde::{Deserialize, Serialize};
@@ -12,6 +17,7 @@ use std::cmp::max;
use std::cmp::min;
use std::fmt;
use std::io::Read;
use std::str::FromStr;
use storage_broker::proto::SafekeeperTimelineInfo;
use tracing::*;
@@ -29,7 +35,8 @@ use utils::{
lsn::Lsn,
};
pub const SK_PROTOCOL_VERSION: u32 = 2;
pub const SK_PROTO_VERSION_2: u32 = 2;
pub const SK_PROTO_VERSION_3: u32 = 3;
pub const UNKNOWN_SERVER_VERSION: u32 = 0;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@@ -56,8 +63,28 @@ impl TermHistory {
TermHistory(Vec::new())
}
// Parse TermHistory as n_entries followed by TermLsn pairs
// Parse TermHistory as n_entries followed by TermLsn pairs in network order.
pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
let n_entries = bytes
.get_u32_f()
.with_context(|| "TermHistory misses len")?;
let mut res = Vec::with_capacity(n_entries as usize);
for i in 0..n_entries {
let term = bytes
.get_u64_f()
.with_context(|| format!("TermHistory pos {} misses term", i))?;
let lsn = bytes
.get_u64_f()
.with_context(|| format!("TermHistory pos {} misses lsn", i))?
.into();
res.push(TermLsn { term, lsn })
}
Ok(TermHistory(res))
}
// Parse TermHistory as n_entries followed by TermLsn pairs in LE order.
// TODO remove once v2 protocol is fully dropped.
pub fn from_bytes_le(bytes: &mut Bytes) -> Result<TermHistory> {
if bytes.remaining() < 4 {
bail!("TermHistory misses len");
}
@@ -197,6 +224,18 @@ impl AcceptorState {
/// Initial Proposer -> Acceptor message
#[derive(Debug, Deserialize)]
pub struct ProposerGreeting {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mconf: membership::Configuration,
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
pub wal_seg_size: u32,
}
/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Deserialize)]
pub struct ProposerGreetingV2 {
/// proposer-acceptor protocol version
pub protocol_version: u32,
/// Postgres server version
@@ -213,27 +252,35 @@ pub struct ProposerGreeting {
/// (acceptor voted for).
#[derive(Debug, Serialize)]
pub struct AcceptorGreeting {
term: u64,
node_id: NodeId,
mconf: membership::Configuration,
term: u64,
}
/// Vote request sent from proposer to safekeepers
#[derive(Debug, Deserialize)]
#[derive(Debug)]
pub struct VoteRequest {
pub generation: Generation,
pub term: Term,
}
/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Deserialize)]
pub struct VoteRequestV2 {
pub term: Term,
}
/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
pub struct VoteResponse {
generation: Generation, // membership conf generation
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
vote_given: u64, // fixme u64 due to padding
vote_given: bool,
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
// proposer to choose the most advanced one.
pub flush_lsn: Lsn,
truncate_lsn: Lsn,
pub term_history: TermHistory,
timeline_start_lsn: Lsn,
}
/*
@@ -242,10 +289,10 @@ pub struct VoteResponse {
*/
#[derive(Debug)]
pub struct ProposerElected {
pub generation: Generation, // membership conf generation
pub term: Term,
pub start_streaming_at: Lsn,
pub term_history: TermHistory,
pub timeline_start_lsn: Lsn,
}
/// Request with WAL message sent from proposer to safekeeper. Along the way it
@@ -257,6 +304,22 @@ pub struct AppendRequest {
}
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeader {
pub generation: Generation, // membership conf generation
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
/// start position of message in WAL
pub begin_lsn: Lsn,
/// end position of message in WAL
pub end_lsn: Lsn,
/// LSN committed by quorum of safekeepers
pub commit_lsn: Lsn,
/// minimal LSN which may be needed by proposer to perform recovery of some safekeeper
pub truncate_lsn: Lsn,
}
/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeaderV2 {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// TODO: remove this field from the protocol, it in unused -- LSN of term
@@ -277,6 +340,9 @@ pub struct AppendRequestHeader {
/// Report safekeeper state to proposer
#[derive(Debug, Serialize, Clone)]
pub struct AppendResponse {
// Membership conf generation. Not strictly required because on mismatch
// connection is reset, but let's sanity check it.
generation: Generation,
// Current term of the safekeeper; if it is higher than proposer's, the
// compute is out of date.
pub term: Term,
@@ -293,8 +359,9 @@ pub struct AppendResponse {
}
impl AppendResponse {
fn term_only(term: Term) -> AppendResponse {
fn term_only(generation: Generation, term: Term) -> AppendResponse {
AppendResponse {
generation,
term,
flush_lsn: Lsn(0),
commit_lsn: Lsn(0),
@@ -315,72 +382,317 @@ pub enum ProposerAcceptorMessage {
FlushWAL,
}
impl ProposerAcceptorMessage {
/// Parse proposer message.
pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
if proto_version != SK_PROTOCOL_VERSION {
bail!(
"incompatible protocol version {}, expected {}",
proto_version,
SK_PROTOCOL_VERSION
);
/// Augment Bytes with fallible get_uN where N is number of bytes methods.
/// All reads are in network (big endian) order.
trait BytesF {
fn get_u8_f(&mut self) -> Result<u8>;
fn get_u16_f(&mut self) -> Result<u16>;
fn get_u32_f(&mut self) -> Result<u32>;
fn get_u64_f(&mut self) -> Result<u64>;
}
impl BytesF for Bytes {
fn get_u8_f(&mut self) -> Result<u8> {
if self.is_empty() {
bail!("no bytes left, expected 1");
}
// xxx using Reader is inefficient but easy to work with bincode
let mut stream = msg_bytes.reader();
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
match tag {
'g' => {
let msg = ProposerGreeting::des_from(&mut stream)?;
Ok(ProposerAcceptorMessage::Greeting(msg))
}
'v' => {
let msg = VoteRequest::des_from(&mut stream)?;
Ok(ProposerAcceptorMessage::VoteRequest(msg))
}
'e' => {
let mut msg_bytes = stream.into_inner();
if msg_bytes.remaining() < 16 {
bail!("ProposerElected message is not complete");
}
let term = msg_bytes.get_u64_le();
let start_streaming_at = msg_bytes.get_u64_le().into();
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
if msg_bytes.remaining() < 8 {
bail!("ProposerElected message is not complete");
}
let timeline_start_lsn = msg_bytes.get_u64_le().into();
let msg = ProposerElected {
term,
start_streaming_at,
timeline_start_lsn,
term_history,
Ok(self.get_u8())
}
fn get_u16_f(&mut self) -> Result<u16> {
if self.is_empty() {
bail!("no bytes left, expected 2");
}
Ok(self.get_u16())
}
fn get_u32_f(&mut self) -> Result<u32> {
if self.remaining() < 4 {
bail!("only {} bytes left, expected 4", self.remaining());
}
Ok(self.get_u32())
}
fn get_u64_f(&mut self) -> Result<u64> {
if self.remaining() < 8 {
bail!("only {} bytes left, expected 8", self.remaining());
}
Ok(self.get_u64())
}
}
impl ProposerAcceptorMessage {
/// Read cstring from Bytes.
fn get_cstr(buf: &mut Bytes) -> Result<String> {
let pos = buf
.iter()
.position(|x| *x == 0)
.ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?;
let result = buf.split_to(pos);
buf.advance(1); // drop the null terminator
match std::str::from_utf8(&result) {
Ok(s) => Ok(s.to_string()),
Err(e) => bail!("invalid utf8 in cstring: {}", e),
}
}
/// Read membership::Configuration from Bytes.
fn get_mconf(buf: &mut Bytes) -> Result<membership::Configuration> {
let generation = buf.get_u32_f().with_context(|| "reading generation")?;
let members_len = buf.get_u32_f().with_context(|| "reading members_len")?;
// Main member set must have at least someone in valid configuration.
// Empty conf is allowed until we fully migrate.
if generation != INVALID_GENERATION && members_len == 0 {
bail!("empty members_len");
}
let mut members = MemberSet::empty();
for i in 0..members_len {
let id = buf
.get_u64_f()
.with_context(|| format!("reading member {} node_id", i))?;
let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?;
let pg_port = buf
.get_u16_f()
.with_context(|| format!("reading member {} port", i))?;
let sk = SafekeeperId {
id: NodeId(id),
host,
pg_port,
};
members.add(sk)?;
}
let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?;
// Non joint conf.
if new_members_len == 0 {
Ok(membership::Configuration {
generation,
members,
new_members: None,
})
} else {
let mut new_members = MemberSet::empty();
for i in 0..new_members_len {
let id = buf
.get_u64_f()
.with_context(|| format!("reading new member {} node_id", i))?;
let host = Self::get_cstr(buf)
.with_context(|| format!("reading new member {} host", i))?;
let pg_port = buf
.get_u16_f()
.with_context(|| format!("reading new member {} port", i))?;
let sk = SafekeeperId {
id: NodeId(id),
host,
pg_port,
};
Ok(ProposerAcceptorMessage::Elected(msg))
new_members.add(sk)?;
}
'a' => {
// read header followed by wal data
let hdr = AppendRequestHeader::des_from(&mut stream)?;
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.context("begin_lsn > end_lsn in AppendRequest")?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
"AppendRequest is longer than MAX_SEND_SIZE ({})",
MAX_SEND_SIZE
);
Ok(membership::Configuration {
generation,
members,
new_members: Some(new_members),
})
}
}
/// Parse proposer message.
pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
if proto_version == SK_PROTO_VERSION_3 {
if msg_bytes.is_empty() {
bail!("ProposerAcceptorMessage is not complete: missing tag");
}
let tag = msg_bytes.get_u8_f().with_context(|| {
"ProposerAcceptorMessage is not complete: missing tag".to_string()
})? as char;
match tag {
'g' => {
let tenant_id_str =
Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?;
let tenant_id = TenantId::from_str(&tenant_id_str)?;
let timeline_id_str =
Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?;
let timeline_id = TimelineId::from_str(&timeline_id_str)?;
let mconf = Self::get_mconf(&mut msg_bytes)?;
let pg_version = msg_bytes
.get_u32_f()
.with_context(|| "reading pg_version")?;
let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?;
let wal_seg_size = msg_bytes
.get_u32_f()
.with_context(|| "reading wal_seg_size")?;
let g = ProposerGreeting {
tenant_id,
timeline_id,
mconf,
pg_version,
system_id,
wal_seg_size,
};
Ok(ProposerAcceptorMessage::Greeting(g))
}
'v' => {
let generation = msg_bytes
.get_u32_f()
.with_context(|| "reading generation")?;
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
let v = VoteRequest { generation, term };
Ok(ProposerAcceptorMessage::VoteRequest(v))
}
'e' => {
let generation = msg_bytes
.get_u32_f()
.with_context(|| "reading generation")?;
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
let start_streaming_at: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading start_streaming_at")?
.into();
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
let msg = ProposerElected {
generation,
term,
start_streaming_at,
term_history,
};
Ok(ProposerAcceptorMessage::Elected(msg))
}
'a' => {
let generation = msg_bytes
.get_u32_f()
.with_context(|| "reading generation")?;
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
let begin_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading begin_lsn")?
.into();
let end_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading end_lsn")?
.into();
let commit_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading commit_lsn")?
.into();
let truncate_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading truncate_lsn")?
.into();
let hdr = AppendRequestHeader {
generation,
term,
begin_lsn,
end_lsn,
commit_lsn,
truncate_lsn,
};
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.context("begin_lsn > end_lsn in AppendRequest")?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
"AppendRequest is longer than MAX_SEND_SIZE ({})",
MAX_SEND_SIZE
);
}
if msg_bytes.remaining() < rec_size {
bail!(
"reading WAL: only {} bytes left, wanted {}",
msg_bytes.remaining(),
rec_size
);
}
let wal_data = msg_bytes.copy_to_bytes(rec_size);
let msg = AppendRequest { h: hdr, wal_data };
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
stream.read_exact(&mut wal_data_vec)?;
let wal_data = Bytes::from(wal_data_vec);
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
}
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
// TODO remove proto_version == 3 after converting all msgs
} else if proto_version == SK_PROTO_VERSION_2 || proto_version == SK_PROTO_VERSION_3 {
// xxx using Reader is inefficient but easy to work with bincode
let mut stream = msg_bytes.reader();
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
match tag {
'g' => {
let msgv2 = ProposerGreetingV2::des_from(&mut stream)?;
let g = ProposerGreeting {
tenant_id: msgv2.tenant_id,
timeline_id: msgv2.timeline_id,
mconf: membership::Configuration {
generation: INVALID_GENERATION,
members: MemberSet::empty(),
new_members: None,
},
pg_version: msgv2.pg_version,
system_id: msgv2.system_id,
wal_seg_size: msgv2.wal_seg_size,
};
Ok(ProposerAcceptorMessage::Greeting(g))
}
'v' => {
let msg = VoteRequestV2::des_from(&mut stream)?;
let v = VoteRequest {
generation: INVALID_GENERATION,
term: msg.term,
};
Ok(ProposerAcceptorMessage::VoteRequest(v))
}
'e' => {
let mut msg_bytes = stream.into_inner();
if msg_bytes.remaining() < 16 {
bail!("ProposerElected message is not complete");
}
let term = msg_bytes.get_u64_le();
let start_streaming_at = msg_bytes.get_u64_le().into();
let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?;
if msg_bytes.remaining() < 8 {
bail!("ProposerElected message is not complete");
}
let _timeline_start_lsn = msg_bytes.get_u64_le();
let msg = ProposerElected {
generation: INVALID_GENERATION,
term,
start_streaming_at,
term_history,
};
Ok(ProposerAcceptorMessage::Elected(msg))
}
'a' => {
// read header followed by wal data
let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?;
let hdr = AppendRequestHeader {
generation: INVALID_GENERATION,
term: hdrv2.term,
begin_lsn: hdrv2.begin_lsn,
end_lsn: hdrv2.end_lsn,
commit_lsn: hdrv2.commit_lsn,
truncate_lsn: hdrv2.truncate_lsn,
};
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.context("begin_lsn > end_lsn in AppendRequest")?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
"AppendRequest is longer than MAX_SEND_SIZE ({})",
MAX_SEND_SIZE
);
}
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
stream.read_exact(&mut wal_data_vec)?;
let wal_data = Bytes::from(wal_data_vec);
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
}
} else {
bail!("unsupported protocol version {}", proto_version);
}
}
@@ -394,36 +706,21 @@ impl ProposerAcceptorMessage {
// We explicitly list all fields, to draw attention here when new fields are added.
let mut size = BASE_SIZE;
size += match self {
Self::Greeting(ProposerGreeting {
protocol_version: _,
pg_version: _,
proposer_id: _,
system_id: _,
timeline_id: _,
tenant_id: _,
tli: _,
wal_seg_size: _,
}) => 0,
Self::Greeting(_) => 0,
Self::VoteRequest(VoteRequest { term: _ }) => 0,
Self::VoteRequest(_) => 0,
Self::Elected(ProposerElected {
term: _,
start_streaming_at: _,
term_history: _,
timeline_start_lsn: _,
}) => 0,
Self::Elected(_) => 0,
Self::AppendRequest(AppendRequest {
h:
AppendRequestHeader {
generation: _,
term: _,
term_start_lsn: _,
begin_lsn: _,
end_lsn: _,
commit_lsn: _,
truncate_lsn: _,
proposer_uuid: _,
},
wal_data,
}) => wal_data.len(),
@@ -431,13 +728,12 @@ impl ProposerAcceptorMessage {
Self::NoFlushAppendRequest(AppendRequest {
h:
AppendRequestHeader {
generation: _,
term: _,
term_start_lsn: _,
begin_lsn: _,
end_lsn: _,
commit_lsn: _,
truncate_lsn: _,
proposer_uuid: _,
},
wal_data,
}) => wal_data.len(),
@@ -458,45 +754,118 @@ pub enum AcceptorProposerMessage {
}
impl AcceptorProposerMessage {
/// Serialize acceptor -> proposer message.
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
match self {
AcceptorProposerMessage::Greeting(msg) => {
buf.put_u64_le('g' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.node_id.0);
}
AcceptorProposerMessage::VoteResponse(msg) => {
buf.put_u64_le('v' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.vote_given);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.truncate_lsn.into());
buf.put_u32_le(msg.term_history.0.len() as u32);
for e in &msg.term_history.0 {
buf.put_u64_le(e.term);
buf.put_u64_le(e.lsn.into());
}
buf.put_u64_le(msg.timeline_start_lsn.into());
}
AcceptorProposerMessage::AppendResponse(msg) => {
buf.put_u64_le('a' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.commit_lsn.into());
buf.put_i64_le(msg.hs_feedback.ts);
buf.put_u64_le(msg.hs_feedback.xmin);
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
fn put_cstr(buf: &mut BytesMut, s: &str) {
buf.put_slice(s.as_bytes());
buf.put_u8(0); // null terminator
}
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
// if it is not present.
if let Some(ref msg) = msg.pageserver_feedback {
msg.serialize(buf);
}
}
/// Serialize membership::Configuration into buf.
fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) {
buf.put_u32(mconf.generation);
buf.put_u32(mconf.members.m.len() as u32);
for sk in &mconf.members.m {
buf.put_u64(sk.id.0);
Self::put_cstr(buf, &sk.host);
buf.put_u16(sk.pg_port);
}
if let Some(ref new_members) = mconf.new_members {
buf.put_u32(new_members.m.len() as u32);
for sk in &new_members.m {
buf.put_u64(sk.id.0);
Self::put_cstr(buf, &sk.host);
buf.put_u16(sk.pg_port);
}
} else {
buf.put_u32(0);
}
}
Ok(())
/// Serialize acceptor -> proposer message.
pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> {
if proto_version == SK_PROTO_VERSION_3 {
match self {
AcceptorProposerMessage::Greeting(msg) => {
buf.put_u8('g' as u8);
buf.put_u64(msg.node_id.0);
Self::serialize_mconf(buf, &msg.mconf);
buf.put_u64(msg.term)
}
AcceptorProposerMessage::VoteResponse(msg) => {
buf.put_u8('v' as u8);
buf.put_u32(msg.generation);
buf.put_u64(msg.term);
buf.put_u8(msg.vote_given as u8);
buf.put_u64(msg.flush_lsn.into());
buf.put_u64(msg.truncate_lsn.into());
buf.put_u32(msg.term_history.0.len() as u32);
for e in &msg.term_history.0 {
buf.put_u64(e.term);
buf.put_u64(e.lsn.into());
}
}
AcceptorProposerMessage::AppendResponse(msg) => {
buf.put_u8('a' as u8);
buf.put_u32(msg.generation);
buf.put_u64(msg.term);
buf.put_u64(msg.flush_lsn.into());
buf.put_u64(msg.commit_lsn.into());
buf.put_i64(msg.hs_feedback.ts);
buf.put_u64(msg.hs_feedback.xmin);
buf.put_u64(msg.hs_feedback.catalog_xmin);
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
// if it is not present.
if let Some(ref msg) = msg.pageserver_feedback {
msg.serialize(buf);
}
}
}
Ok(())
// TODO remove 3 after converting all msgs
} else if proto_version == SK_PROTO_VERSION_2 {
match self {
AcceptorProposerMessage::Greeting(msg) => {
buf.put_u64_le('g' as u64);
// v2 didn't have mconf and fields were reordered
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.node_id.0);
}
AcceptorProposerMessage::VoteResponse(msg) => {
// v2 didn't have generation, had u64 vote_given and timeline_start_lsn
buf.put_u64_le('v' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.vote_given as u64);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.truncate_lsn.into());
buf.put_u32_le(msg.term_history.0.len() as u32);
for e in &msg.term_history.0 {
buf.put_u64_le(e.term);
buf.put_u64_le(e.lsn.into());
}
// removed timeline_start_lsn
buf.put_u64_le(0);
}
AcceptorProposerMessage::AppendResponse(msg) => {
// v2 didn't have generation
buf.put_u64_le('a' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.commit_lsn.into());
buf.put_i64_le(msg.hs_feedback.ts);
buf.put_u64_le(msg.hs_feedback.xmin);
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
// if it is not present.
if let Some(ref msg) = msg.pageserver_feedback {
msg.serialize(buf);
}
}
}
Ok(())
} else {
bail!("unsupported protocol version {}", proto_version);
}
}
}
@@ -593,14 +962,6 @@ where
&mut self,
msg: &ProposerGreeting,
) -> Result<Option<AcceptorProposerMessage>> {
// Check protocol compatibility
if msg.protocol_version != SK_PROTOCOL_VERSION {
bail!(
"incompatible protocol version {}, expected {}",
msg.protocol_version,
SK_PROTOCOL_VERSION
);
}
/* Postgres major version mismatch is treated as fatal error
* because safekeepers parse WAL headers and the format
* may change between versions.
@@ -655,15 +1016,16 @@ where
self.state.finish_change(&state).await?;
}
info!(
"processed greeting from walproposer {}, sending term {:?}",
msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
self.state.acceptor_state.term
);
Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
term: self.state.acceptor_state.term,
let apg = AcceptorGreeting {
node_id: self.node_id,
})))
mconf: self.state.mconf.clone(),
term: self.state.acceptor_state.term,
};
info!(
"processed greeting {:?} from walproposer, sending {:?}",
msg, apg
);
Ok(Some(AcceptorProposerMessage::Greeting(apg)))
}
/// Give vote for the given term, if we haven't done that previously.
@@ -684,12 +1046,12 @@ where
self.wal_store.flush_wal().await?;
// initialize with refusal
let mut resp = VoteResponse {
generation: self.state.mconf.generation,
term: self.state.acceptor_state.term,
vote_given: false as u64,
vote_given: false,
flush_lsn: self.flush_lsn(),
truncate_lsn: self.state.inmem.peer_horizon_lsn,
term_history: self.get_term_history(),
timeline_start_lsn: self.state.timeline_start_lsn,
};
if self.state.acceptor_state.term < msg.term {
let mut state = self.state.start_change();
@@ -698,15 +1060,16 @@ where
self.state.finish_change(&state).await?;
resp.term = self.state.acceptor_state.term;
resp.vote_given = true as u64;
resp.vote_given = true;
}
info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
info!("processed {:?}: sending {:?}", msg, &resp);
Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
}
/// Form AppendResponse from current state.
fn append_response(&self) -> AppendResponse {
let ar = AppendResponse {
generation: self.state.mconf.generation,
term: self.state.acceptor_state.term,
flush_lsn: self.flush_lsn(),
commit_lsn: self.state.commit_lsn,
@@ -805,18 +1168,22 @@ where
// Here we learn initial LSN for the first time, set fields
// interested in that.
if state.timeline_start_lsn == Lsn(0) {
// Remember point where WAL begins globally.
state.timeline_start_lsn = msg.timeline_start_lsn;
info!(
"setting timeline_start_lsn to {:?}",
state.timeline_start_lsn
);
if let Some(start_lsn) = msg.term_history.0.first() {
if state.timeline_start_lsn == Lsn(0) {
// Remember point where WAL begins globally. In the future it
// will be intialized immediately on timeline creation.
state.timeline_start_lsn = start_lsn.lsn;
info!(
"setting timeline_start_lsn to {:?}",
state.timeline_start_lsn
);
}
}
if state.peer_horizon_lsn == Lsn(0) {
// Update peer_horizon_lsn as soon as we know where timeline starts.
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
state.peer_horizon_lsn = msg.timeline_start_lsn;
state.peer_horizon_lsn = state.timeline_start_lsn;
}
if state.local_start_lsn == Lsn(0) {
state.local_start_lsn = msg.start_streaming_at;
@@ -896,7 +1263,10 @@ where
// If our term is higher, immediately refuse the message.
if self.state.acceptor_state.term > msg.h.term {
let resp = AppendResponse::term_only(self.state.acceptor_state.term);
let resp = AppendResponse::term_only(
self.state.mconf.generation,
self.state.acceptor_state.term,
);
return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
}
@@ -924,10 +1294,8 @@ where
);
}
// Now we know that we are in the same term as the proposer,
// processing the message.
self.state.inmem.proposer_uuid = msg.h.proposer_uuid;
// Now we know that we are in the same term as the proposer, process the
// message.
// do the job
if !msg.wal_data.is_empty() {

View File

@@ -1,330 +1,100 @@
use std::collections::HashMap;
use std::fmt::Display;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context};
use futures::future::Either;
use anyhow::Context;
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend};
use postgres_ffi::waldecoder::WalDecodeError;
use postgres_ffi::MAX_SEND_SIZE;
use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder};
use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::sync::mpsc::error::SendError;
use tokio::task::JoinHandle;
use tokio::time::MissedTickBehavior;
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use utils::postgres_client::Compression;
use utils::postgres_client::InterpretedFormat;
use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords};
use wal_decoder::wire_format::ToWireFormat;
use crate::metrics::WAL_READERS;
use crate::send_wal::{EndWatchView, WalSenderGuard};
use crate::timeline::WalResidentTimeline;
use crate::wal_reader_stream::{StreamingWalReader, WalBytes};
use crate::send_wal::EndWatchView;
use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder};
/// Identifier used to differentiate between senders of the same
/// shard.
///
/// In the steady state there's only one, but two pageservers may
/// temporarily have the same shard attached and attempt to ingest
/// WAL for it. See also [`ShardSenderId`].
#[derive(Hash, Eq, PartialEq, Copy, Clone)]
struct SenderId(u8);
impl SenderId {
fn first() -> Self {
SenderId(0)
}
fn next(&self) -> Self {
SenderId(self.0.checked_add(1).expect("few senders"))
}
/// Shard-aware interpreted record sender.
/// This is used for sending WAL to the pageserver. Said WAL
/// is pre-interpreted and filtered for the shard.
pub(crate) struct InterpretedWalSender<'a, IO> {
pub(crate) format: InterpretedFormat,
pub(crate) compression: Option<Compression>,
pub(crate) pgb: &'a mut PostgresBackend<IO>,
pub(crate) wal_stream_builder: WalReaderStreamBuilder,
pub(crate) end_watch_view: EndWatchView,
pub(crate) shard: ShardIdentity,
pub(crate) pg_version: u32,
pub(crate) appname: Option<String>,
}
#[derive(Hash, Eq, PartialEq)]
struct ShardSenderId {
shard: ShardIdentity,
sender_id: SenderId,
}
impl Display for ShardSenderId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug())
}
}
impl ShardSenderId {
fn new(shard: ShardIdentity, sender_id: SenderId) -> Self {
ShardSenderId { shard, sender_id }
}
fn shard(&self) -> ShardIdentity {
self.shard
}
}
/// Shard-aware fan-out interpreted record reader.
/// Reads WAL from disk, decodes it, intepretets it, and sends
/// it to any [`InterpretedWalSender`] connected to it.
/// Each [`InterpretedWalSender`] corresponds to one shard
/// and gets interpreted records concerning that shard only.
pub(crate) struct InterpretedWalReader {
wal_stream: StreamingWalReader,
shard_senders: HashMap<ShardIdentity, smallvec::SmallVec<[ShardSenderState; 1]>>,
shard_notification_rx: Option<tokio::sync::mpsc::UnboundedReceiver<AttachShardNotification>>,
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
pg_version: u32,
}
/// A handle for [`InterpretedWalReader`] which allows for interacting with it
/// when it runs as a separate tokio task.
#[derive(Debug)]
pub(crate) struct InterpretedWalReaderHandle {
join_handle: JoinHandle<Result<(), InterpretedWalReaderError>>,
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
shard_notification_tx: tokio::sync::mpsc::UnboundedSender<AttachShardNotification>,
}
struct ShardSenderState {
sender_id: SenderId,
tx: tokio::sync::mpsc::Sender<Batch>,
next_record_lsn: Lsn,
}
/// State of [`InterpretedWalReader`] visible outside of the task running it.
#[derive(Debug)]
pub(crate) enum InterpretedWalReaderState {
Running { current_position: Lsn },
Done,
}
pub(crate) struct Batch {
struct Batch {
wal_end_lsn: Lsn,
available_wal_end_lsn: Lsn,
records: InterpretedWalRecords,
}
#[derive(thiserror::Error, Debug)]
pub enum InterpretedWalReaderError {
/// Handler initiates the end of streaming.
#[error("decode error: {0}")]
Decode(#[from] WalDecodeError),
#[error("read or interpret error: {0}")]
ReadOrInterpret(#[from] anyhow::Error),
#[error("wal stream closed")]
WalStreamClosed,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
/// Send interpreted WAL to a receiver.
/// Stops when an error occurs or the receiver is caught up and there's no active compute.
///
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
let mut wal_position = self.wal_stream_builder.start_pos();
let mut wal_decoder =
WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
impl InterpretedWalReaderState {
fn current_position(&self) -> Option<Lsn> {
match self {
InterpretedWalReaderState::Running {
current_position, ..
} => Some(*current_position),
InterpretedWalReaderState::Done => None,
}
}
}
let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?;
let mut stream = std::pin::pin!(stream);
pub(crate) struct AttachShardNotification {
shard_id: ShardIdentity,
sender: tokio::sync::mpsc::Sender<Batch>,
start_pos: Lsn,
}
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
keepalive_ticker.reset();
impl InterpretedWalReader {
/// Spawn the reader in a separate tokio task and return a handle
pub(crate) fn spawn(
wal_stream: StreamingWalReader,
start_pos: Lsn,
tx: tokio::sync::mpsc::Sender<Batch>,
shard: ShardIdentity,
pg_version: u32,
appname: &Option<String>,
) -> InterpretedWalReaderHandle {
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
current_position: start_pos,
}));
let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel();
let reader = InterpretedWalReader {
wal_stream,
shard_senders: HashMap::from([(
shard,
smallvec::smallvec![ShardSenderState {
sender_id: SenderId::first(),
tx,
next_record_lsn: start_pos,
}],
)]),
shard_notification_rx: Some(shard_notification_rx),
state: state.clone(),
pg_version,
};
let metric = WAL_READERS
.get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")])
.unwrap();
let join_handle = tokio::task::spawn(
async move {
metric.inc();
scopeguard::defer! {
metric.dec();
}
let res = reader.run_impl(start_pos).await;
if let Err(ref err) = res {
tracing::error!("Task finished with error: {err}");
}
res
}
.instrument(info_span!("interpreted wal reader")),
);
InterpretedWalReaderHandle {
join_handle,
state,
shard_notification_tx,
}
}
/// Construct the reader without spawning anything
/// Callers should drive the future returned by [`Self::run`].
pub(crate) fn new(
wal_stream: StreamingWalReader,
start_pos: Lsn,
tx: tokio::sync::mpsc::Sender<Batch>,
shard: ShardIdentity,
pg_version: u32,
) -> InterpretedWalReader {
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
current_position: start_pos,
}));
InterpretedWalReader {
wal_stream,
shard_senders: HashMap::from([(
shard,
smallvec::smallvec![ShardSenderState {
sender_id: SenderId::first(),
tx,
next_record_lsn: start_pos,
}],
)]),
shard_notification_rx: None,
state: state.clone(),
pg_version,
}
}
/// Entry point for future (polling) based wal reader.
pub(crate) async fn run(
self,
start_pos: Lsn,
appname: &Option<String>,
) -> Result<(), CopyStreamHandlerEnd> {
let metric = WAL_READERS
.get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")])
.unwrap();
metric.inc();
scopeguard::defer! {
metric.dec();
}
let res = self.run_impl(start_pos).await;
if let Err(err) = res {
tracing::error!("Interpreted wal reader encountered error: {err}");
} else {
tracing::info!("Interpreted wal reader exiting");
}
Err(CopyStreamHandlerEnd::Other(anyhow!(
"interpreted wal reader finished"
)))
}
/// Send interpreted WAL to one or more [`InterpretedWalSender`]s
/// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`]
/// goes out of scope.
async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> {
let defer_state = self.state.clone();
scopeguard::defer! {
*defer_state.write().unwrap() = InterpretedWalReaderState::Done;
}
let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
let shard = vec![self.shard];
loop {
tokio::select! {
// Main branch for reading WAL and forwarding it
wal_or_reset = self.wal_stream.next() => {
let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below"));
let WalBytes {
wal,
wal_start_lsn: _,
wal_end_lsn,
available_wal_end_lsn,
} = match wal {
Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?,
None => {
// [`StreamingWalReader::next`] is an endless stream of WAL.
// It shouldn't ever finish unless it panicked or became internally
// inconsistent.
return Result::Err(InterpretedWalReaderError::WalStreamClosed);
}
// Get some WAL from the stream and then: decode, interpret and push it down the
// pipeline.
wal = stream.next(), if tx.capacity() > 0 => {
let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal {
Some(some) => some?,
None => { break; }
};
wal_position = wal_end_lsn;
wal_decoder.feed_bytes(&wal);
// Deserialize and interpret WAL records from this batch of WAL.
// Interpreted records for each shard are collected separately.
let shard_ids = self.shard_senders.keys().copied().collect::<Vec<_>>();
let mut records_by_sender: HashMap<ShardSenderId, Vec<InterpretedWalRecord>> = HashMap::new();
let mut records = Vec::new();
let mut max_next_record_lsn = None;
while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()?
while let Some((next_record_lsn, recdata)) = wal_decoder
.poll_decode()
.with_context(|| "Failed to decode WAL")?
{
assert!(next_record_lsn.is_aligned());
max_next_record_lsn = Some(next_record_lsn);
// Deserialize and interpret WAL record
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
&shard_ids,
&shard,
next_record_lsn,
self.pg_version,
)
.with_context(|| "Failed to interpret WAL")?;
.with_context(|| "Failed to interpret WAL")?
.remove(&self.shard)
.unwrap();
for (shard, record) in interpreted {
if record.is_empty() {
continue;
}
let mut states_iter = self.shard_senders
.get(&shard)
.expect("keys collected above")
.iter()
.filter(|state| record.next_record_lsn > state.next_record_lsn)
.peekable();
while let Some(state) = states_iter.next() {
let shard_sender_id = ShardSenderId::new(shard, state.sender_id);
// The most commont case is one sender per shard. Peek and break to avoid the
// clone in that situation.
if states_iter.peek().is_none() {
records_by_sender.entry(shard_sender_id).or_default().push(record);
break;
} else {
records_by_sender.entry(shard_sender_id).or_default().push(record.clone());
}
}
if !interpreted.is_empty() {
records.push(interpreted);
}
}
@@ -333,169 +103,19 @@ impl InterpretedWalReader {
None => { continue; }
};
// Update the current position such that new receivers can decide
// whether to attach to us or spawn a new WAL reader.
match &mut *self.state.write().unwrap() {
InterpretedWalReaderState::Running { current_position, .. } => {
*current_position = max_next_record_lsn;
},
InterpretedWalReaderState::Done => {
unreachable!()
}
}
// Send interpreted records downstream. Anything that has already been seen
// by a shard is filtered out.
let mut shard_senders_to_remove = Vec::new();
for (shard, states) in &mut self.shard_senders {
for state in states {
if max_next_record_lsn <= state.next_record_lsn {
continue;
}
let shard_sender_id = ShardSenderId::new(*shard, state.sender_id);
let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default();
let batch = InterpretedWalRecords {
records,
next_record_lsn: Some(max_next_record_lsn),
};
let res = state.tx.send(Batch {
wal_end_lsn,
available_wal_end_lsn,
records: batch,
}).await;
if res.is_err() {
shard_senders_to_remove.push(shard_sender_id);
} else {
state.next_record_lsn = max_next_record_lsn;
}
}
}
// Clean up any shard senders that have dropped out.
// This is inefficient, but such events are rare (connection to PS termination)
// and the number of subscriptions on the same shards very small (only one
// for the steady state).
for to_remove in shard_senders_to_remove {
let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above");
if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) {
shard_senders.remove(idx);
tracing::info!("Removed shard sender {}", to_remove);
}
if shard_senders.is_empty() {
self.shard_senders.remove(&to_remove.shard());
}
}
},
// Listen for new shards that want to attach to this reader.
// If the reader is not running as a task, then this is not supported
// (see the pending branch below).
notification = match self.shard_notification_rx.as_mut() {
Some(rx) => Either::Left(rx.recv()),
None => Either::Right(std::future::pending())
} => {
if let Some(n) = notification {
let AttachShardNotification { shard_id, sender, start_pos } = n;
// Update internal and external state, then reset the WAL stream
// if required.
let senders = self.shard_senders.entry(shard_id).or_default();
let new_sender_id = match senders.last() {
Some(sender) => sender.sender_id.next(),
None => SenderId::first()
};
senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
let current_pos = self.state.read().unwrap().current_position().unwrap();
if start_pos < current_pos {
self.wal_stream.reset(start_pos).await;
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
}
tracing::info!(
"Added shard sender {} with start_pos={} current_pos={}",
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
);
}
}
}
}
}
}
impl InterpretedWalReaderHandle {
/// Fan-out the reader by attaching a new shard to it
pub(crate) fn fanout(
&self,
shard_id: ShardIdentity,
sender: tokio::sync::mpsc::Sender<Batch>,
start_pos: Lsn,
) -> Result<(), SendError<AttachShardNotification>> {
self.shard_notification_tx.send(AttachShardNotification {
shard_id,
sender,
start_pos,
})
}
/// Get the current WAL position of the reader
pub(crate) fn current_position(&self) -> Option<Lsn> {
self.state.read().unwrap().current_position()
}
pub(crate) fn abort(&self) {
self.join_handle.abort()
}
}
impl Drop for InterpretedWalReaderHandle {
fn drop(&mut self) {
tracing::info!("Aborting interpreted wal reader");
self.abort()
}
}
pub(crate) struct InterpretedWalSender<'a, IO> {
pub(crate) format: InterpretedFormat,
pub(crate) compression: Option<Compression>,
pub(crate) appname: Option<String>,
pub(crate) tli: WalResidentTimeline,
pub(crate) start_lsn: Lsn,
pub(crate) pgb: &'a mut PostgresBackend<IO>,
pub(crate) end_watch_view: EndWatchView,
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
pub(crate) rx: tokio::sync::mpsc::Receiver<Batch>,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
/// Send interpreted WAL records over the network.
/// Also manages keep-alives if nothing was sent for a while.
pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
keepalive_ticker.reset();
let mut wal_position = self.start_lsn;
loop {
tokio::select! {
batch = self.rx.recv() => {
let batch = match batch {
Some(b) => b,
None => {
return Result::Err(
CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early"))
);
}
let batch = InterpretedWalRecords {
records,
next_record_lsn: Some(max_next_record_lsn),
};
wal_position = batch.wal_end_lsn;
tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap();
},
// For a previously interpreted batch, serialize it and push it down the wire.
batch = rx.recv() => {
let batch = match batch {
Some(b) => b,
None => { break; }
};
let buf = batch
.records
@@ -516,21 +136,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
})).await?;
}
// Send a periodic keep alive when the connection has been idle for a while.
// Since we've been idle, also check if we can stop streaming.
_ = keepalive_ticker.tick() => {
if let Some(remote_consistent_lsn) = self.wal_sender_guard
.walsenders()
.get_ws_remote_consistent_lsn(self.wal_sender_guard.id())
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Stop streaming if the receivers are caught up and
// there's no active compute. This causes the loop in
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
// to exit and terminate the WAL stream.
break;
}
}
self.pgb
.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
wal_end: self.end_watch_view.get().0,
@@ -538,259 +144,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
request_reply: true,
}))
.await?;
},
}
}
}
// The loop above ends when the receiver is caught up and there's no more WAL to send.
Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
self.appname, wal_position,
)))
}
}
#[cfg(test)]
mod tests {
use std::{collections::HashMap, str::FromStr, time::Duration};
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
use postgres_ffi::MAX_SEND_SIZE;
use tokio::sync::mpsc::error::TryRecvError;
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
shard::{ShardCount, ShardNumber},
};
use crate::{
send_interpreted_wal::{Batch, InterpretedWalReader},
test_utils::Env,
wal_reader_stream::StreamingWalReader,
};
#[tokio::test]
async fn test_interpreted_wal_reader_fanout() {
let _ = env_logger::builder().is_test(true).try_init();
const SIZE: usize = 8 * 1024;
const MSG_COUNT: usize = 200;
const PG_VERSION: u32 = 17;
const SHARD_COUNT: u8 = 2;
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
let env = Env::new(true).unwrap();
let tli = env
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
.await
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();
tracing::info!("Doing first round of reads ...");
let streaming_wal_reader = StreamingWalReader::new(
resident_tli,
None,
start_lsn,
end_pos,
end_watch,
MAX_SEND_SIZE,
);
let shard_0 = ShardIdentity::new(
ShardNumber(0),
ShardCount(SHARD_COUNT),
ShardStripeSize::default(),
)
.unwrap();
let shard_1 = ShardIdentity::new(
ShardNumber(1),
ShardCount(SHARD_COUNT),
ShardStripeSize::default(),
)
.unwrap();
let mut shards = HashMap::new();
for shard_number in 0..SHARD_COUNT {
let shard_id = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount(SHARD_COUNT),
ShardStripeSize::default(),
)
.unwrap();
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
shards.insert(shard_id, (Some(tx), Some(rx)));
}
let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap();
let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap();
let handle = InterpretedWalReader::spawn(
streaming_wal_reader,
start_lsn,
shard_0_tx,
shard_0,
PG_VERSION,
&Some("pageserver".to_string()),
);
tracing::info!("Reading all WAL with only shard 0 attached ...");
let mut shard_0_interpreted_records = Vec::new();
while let Some(batch) = shard_0_rx.recv().await {
shard_0_interpreted_records.push(batch.records);
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
}
}
let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap();
let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap();
tracing::info!("Attaching shard 1 to the reader at start of WAL");
handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap();
tracing::info!("Reading all WAL with shard 0 and shard 1 attached ...");
let mut shard_1_interpreted_records = Vec::new();
while let Some(batch) = shard_1_rx.recv().await {
shard_1_interpreted_records.push(batch.records);
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
}
}
// This test uses logical messages. Those only go to shard 0. Check that the
// filtering worked and shard 1 did not get any.
assert!(shard_1_interpreted_records
.iter()
.all(|recs| recs.records.is_empty()));
// Shard 0 should not receive anything more since the reader is
// going through wal that it has already processed.
let res = shard_0_rx.try_recv();
if let Ok(ref ok) = res {
tracing::error!(
"Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}",
ok.wal_end_lsn,
ok.available_wal_end_lsn
);
}
assert!(matches!(res, Err(TryRecvError::Empty)));
// Check that the next records lsns received by the two shards match up.
let shard_0_next_lsns = shard_0_interpreted_records
.iter()
.map(|recs| recs.next_record_lsn)
.collect::<Vec<_>>();
let shard_1_next_lsns = shard_1_interpreted_records
.iter()
.map(|recs| recs.next_record_lsn)
.collect::<Vec<_>>();
assert_eq!(shard_0_next_lsns, shard_1_next_lsns);
handle.abort();
let mut done = false;
for _ in 0..5 {
if handle.current_position().is_none() {
done = true;
break;
}
tokio::time::sleep(Duration::from_millis(1)).await;
}
assert!(done);
}
#[tokio::test]
async fn test_interpreted_wal_reader_same_shard_fanout() {
let _ = env_logger::builder().is_test(true).try_init();
const SIZE: usize = 8 * 1024;
const MSG_COUNT: usize = 200;
const PG_VERSION: u32 = 17;
const SHARD_COUNT: u8 = 2;
const ATTACHED_SHARDS: u8 = 4;
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
let env = Env::new(true).unwrap();
let tli = env
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
.await
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();
let streaming_wal_reader = StreamingWalReader::new(
resident_tli,
None,
start_lsn,
end_pos,
end_watch,
MAX_SEND_SIZE,
);
let shard_0 = ShardIdentity::new(
ShardNumber(0),
ShardCount(SHARD_COUNT),
ShardStripeSize::default(),
)
.unwrap();
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
let mut batch_receivers = vec![rx];
let handle = InterpretedWalReader::spawn(
streaming_wal_reader,
start_lsn,
tx,
shard_0,
PG_VERSION,
&Some("pageserver".to_string()),
);
for _ in 0..(ATTACHED_SHARDS - 1) {
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
handle.fanout(shard_0, tx, start_lsn).unwrap();
batch_receivers.push(rx);
}
loop {
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
for rx in batch_receivers.iter_mut().skip(1) {
let other_batch = rx.recv().await.unwrap();
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
assert_eq!(
batch.available_wal_end_lsn,
other_batch.available_wal_end_lsn
);
}
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
}
}
handle.abort();
let mut done = false;
for _ in 0..5 {
if handle.current_position().is_none() {
done = true;
break;
}
tokio::time::sleep(Duration::from_millis(1)).await;
}
assert!(done);
}
}

View File

@@ -2,18 +2,16 @@
//! with the "START_REPLICATION" message, and registry of walsenders.
use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS};
use crate::metrics::RECEIVED_PS_FEEDBACKS;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::TermLsn;
use crate::send_interpreted_wal::{
Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender,
};
use crate::send_interpreted_wal::InterpretedWalSender;
use crate::timeline::WalResidentTimeline;
use crate::wal_reader_stream::StreamingWalReader;
use crate::wal_reader_stream::WalReaderStreamBuilder;
use crate::wal_storage::WalReader;
use anyhow::{bail, Context as AnyhowContext};
use bytes::Bytes;
use futures::FutureExt;
use futures::future::Either;
use parking_lot::Mutex;
use postgres_backend::PostgresBackend;
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
@@ -21,16 +19,16 @@ use postgres_ffi::get_current_timestamp;
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
use safekeeper_api::models::{
HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
INVALID_FULL_TRANSACTION_ID,
ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
WalSenderState, INVALID_FULL_TRANSACTION_ID,
};
use safekeeper_api::Term;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::failpoint_support;
use utils::id::TenantTimelineId;
use utils::pageserver_feedback::PageserverFeedback;
use utils::postgres_client::PostgresClientProtocol;
use itertools::Itertools;
use std::cmp::{max, min};
use std::net::SocketAddr;
use std::sync::Arc;
@@ -52,12 +50,6 @@ pub struct WalSenders {
walreceivers: Arc<WalReceivers>,
}
pub struct WalSendersTimelineMetricValues {
pub ps_feedback_counter: u64,
pub last_ps_feedback: PageserverFeedback,
pub interpreted_wal_reader_tasks: usize,
}
impl WalSenders {
pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
Arc::new(WalSenders {
@@ -68,8 +60,21 @@ impl WalSenders {
/// Register new walsender. Returned guard provides access to the slot and
/// automatically deregisters in Drop.
fn register(self: &Arc<WalSenders>, walsender_state: WalSenderState) -> WalSenderGuard {
fn register(
self: &Arc<WalSenders>,
ttid: TenantTimelineId,
addr: SocketAddr,
conn_id: ConnectionId,
appname: Option<String>,
) -> WalSenderGuard {
let slots = &mut self.mutex.lock().slots;
let walsender_state = WalSenderState {
ttid,
addr,
conn_id,
appname,
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
};
// find empty slot or create new one
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
slots[pos] = Some(walsender_state);
@@ -85,79 +90,9 @@ impl WalSenders {
}
}
fn create_or_update_interpreted_reader<
FUp: FnOnce(&Arc<InterpretedWalReaderHandle>) -> anyhow::Result<()>,
FNew: FnOnce() -> InterpretedWalReaderHandle,
>(
self: &Arc<WalSenders>,
id: WalSenderId,
start_pos: Lsn,
max_delta_for_fanout: Option<u64>,
update: FUp,
create: FNew,
) -> anyhow::Result<()> {
let state = &mut self.mutex.lock();
let mut selected_interpreted_reader = None;
for slot in state.slots.iter().flatten() {
if let WalSenderState::Interpreted(slot_state) = slot {
if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader {
let select = match (interpreted_reader.current_position(), max_delta_for_fanout)
{
(Some(pos), Some(max_delta)) => {
let delta = pos.0.abs_diff(start_pos.0);
delta <= max_delta
}
// Reader is not active
(None, _) => false,
// Gating fanout by max delta is disabled.
// Attach to any active reader.
(_, None) => true,
};
if select {
selected_interpreted_reader = Some(interpreted_reader.clone());
break;
}
}
}
}
let slot = state.get_slot_mut(id);
let slot_state = match slot {
WalSenderState::Interpreted(s) => s,
WalSenderState::Vanilla(_) => unreachable!(),
};
let selected_or_new = match selected_interpreted_reader {
Some(selected) => {
update(&selected)?;
selected
}
None => Arc::new(create()),
};
slot_state.interpreted_wal_reader = Some(selected_or_new);
Ok(())
}
/// Get state of all walsenders.
pub fn get_all_public(self: &Arc<WalSenders>) -> Vec<safekeeper_api::models::WalSenderState> {
self.mutex
.lock()
.slots
.iter()
.flatten()
.map(|state| match state {
WalSenderState::Vanilla(s) => {
safekeeper_api::models::WalSenderState::Vanilla(s.clone())
}
WalSenderState::Interpreted(s) => {
safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone())
}
})
.collect()
pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
self.mutex.lock().slots.iter().flatten().cloned().collect()
}
/// Get LSN of the most lagging pageserver receiver. Return None if there are no
@@ -168,7 +103,7 @@ impl WalSenders {
.slots
.iter()
.flatten()
.filter_map(|s| match s.get_feedback() {
.filter_map(|s| match s.feedback {
ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
ReplicationFeedback::Standby(_) => None,
})
@@ -176,25 +111,9 @@ impl WalSenders {
}
/// Returns total counter of pageserver feedbacks received and last feedback.
pub fn info_for_metrics(self: &Arc<WalSenders>) -> WalSendersTimelineMetricValues {
pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
let shared = self.mutex.lock();
let interpreted_wal_reader_tasks = shared
.slots
.iter()
.filter_map(|ss| match ss {
Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(),
Some(WalSenderState::Vanilla(_)) => None,
None => None,
})
.unique_by(|reader| Arc::as_ptr(reader))
.count();
WalSendersTimelineMetricValues {
ps_feedback_counter: shared.ps_feedback_counter,
last_ps_feedback: shared.last_ps_feedback,
interpreted_wal_reader_tasks,
}
(shared.ps_feedback_counter, shared.last_ps_feedback)
}
/// Get aggregated hot standby feedback (we send it to compute).
@@ -205,7 +124,7 @@ impl WalSenders {
/// Record new pageserver feedback, update aggregated values.
fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
let mut shared = self.mutex.lock();
*shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
shared.last_ps_feedback = *feedback;
shared.ps_feedback_counter += 1;
drop(shared);
@@ -224,10 +143,10 @@ impl WalSenders {
"Record standby reply: ts={} apply_lsn={}",
reply.reply_ts, reply.apply_lsn
);
match &mut slot.get_mut_feedback() {
match &mut slot.feedback {
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
ReplicationFeedback::Pageserver(_) => {
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
reply: *reply,
hs_feedback: HotStandbyFeedback::empty(),
})
@@ -239,10 +158,10 @@ impl WalSenders {
fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
let mut shared = self.mutex.lock();
let slot = shared.get_slot_mut(id);
match &mut slot.get_mut_feedback() {
match &mut slot.feedback {
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
ReplicationFeedback::Pageserver(_) => {
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
reply: StandbyReply::empty(),
hs_feedback: *feedback,
})
@@ -256,7 +175,7 @@ impl WalSenders {
pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
let shared = self.mutex.lock();
let slot = shared.get_slot(id);
match slot.get_feedback() {
match slot.feedback {
ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
_ => None,
}
@@ -280,47 +199,6 @@ struct WalSendersShared {
slots: Vec<Option<WalSenderState>>,
}
/// Safekeeper internal definitions of wal sender state
///
/// As opposed to [`safekeeper_api::models::WalSenderState`] these struct may
/// include state that we don not wish to expose to the public api.
#[derive(Debug, Clone)]
pub(crate) enum WalSenderState {
Vanilla(VanillaWalSenderInternalState),
Interpreted(InterpretedWalSenderInternalState),
}
type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState;
#[derive(Debug, Clone)]
pub(crate) struct InterpretedWalSenderInternalState {
public_state: safekeeper_api::models::InterpretedWalSenderState,
interpreted_wal_reader: Option<Arc<InterpretedWalReaderHandle>>,
}
impl WalSenderState {
fn get_addr(&self) -> &SocketAddr {
match self {
WalSenderState::Vanilla(state) => &state.addr,
WalSenderState::Interpreted(state) => &state.public_state.addr,
}
}
fn get_feedback(&self) -> &ReplicationFeedback {
match self {
WalSenderState::Vanilla(state) => &state.feedback,
WalSenderState::Interpreted(state) => &state.public_state.feedback,
}
}
fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback {
match self {
WalSenderState::Vanilla(state) => &mut state.feedback,
WalSenderState::Interpreted(state) => &mut state.public_state.feedback,
}
}
}
impl WalSendersShared {
fn new() -> Self {
WalSendersShared {
@@ -347,7 +225,7 @@ impl WalSendersShared {
let mut agg = HotStandbyFeedback::empty();
let mut reply_agg = StandbyReply::empty();
for ws_state in self.slots.iter().flatten() {
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() {
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
let hs_feedback = standby_feedback.hs_feedback;
// doing Option math like op1.iter().chain(op2.iter()).min()
// would be nicer, but we serialize/deserialize this struct
@@ -439,7 +317,7 @@ impl SafekeeperPostgresHandler {
/// Wrapper around handle_start_replication_guts handling result. Error is
/// handled here while we're still in walsender ttid span; with API
/// extension, this can probably be moved into postgres_backend.
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin + Send>(
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
start_pos: Lsn,
@@ -464,7 +342,7 @@ impl SafekeeperPostgresHandler {
Ok(())
}
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin + Send>(
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
start_pos: Lsn,
@@ -474,30 +352,12 @@ impl SafekeeperPostgresHandler {
let appname = self.appname.clone();
// Use a guard object to remove our entry from the timeline when we are done.
let ws_guard = match self.protocol() {
PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register(
WalSenderState::Vanilla(VanillaWalSenderInternalState {
ttid: self.ttid,
addr: *pgb.get_peer_addr(),
conn_id: self.conn_id,
appname: self.appname.clone(),
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
}),
)),
PostgresClientProtocol::Interpreted { .. } => Arc::new(tli.get_walsenders().register(
WalSenderState::Interpreted(InterpretedWalSenderInternalState {
public_state: safekeeper_api::models::InterpretedWalSenderState {
ttid: self.ttid,
shard: self.shard.unwrap(),
addr: *pgb.get_peer_addr(),
conn_id: self.conn_id,
appname: self.appname.clone(),
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
},
interpreted_wal_reader: None,
}),
)),
};
let ws_guard = Arc::new(tli.get_walsenders().register(
self.ttid,
*pgb.get_peer_addr(),
self.conn_id,
self.appname.clone(),
));
// Walsender can operate in one of two modes which we select by
// application_name: give only committed WAL (used by pageserver) or all
@@ -543,7 +403,7 @@ impl SafekeeperPostgresHandler {
pgb,
// should succeed since we're already holding another guard
tli: tli.wal_residence_guard().await?,
appname: appname.clone(),
appname,
start_pos,
end_pos,
term,
@@ -553,7 +413,7 @@ impl SafekeeperPostgresHandler {
send_buf: vec![0u8; MAX_SEND_SIZE],
};
FutureExt::boxed(sender.run())
Either::Left(sender.run())
}
PostgresClientProtocol::Interpreted {
format,
@@ -561,96 +421,27 @@ impl SafekeeperPostgresHandler {
} => {
let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
let end_watch_view = end_watch.view();
let wal_residence_guard = tli.wal_residence_guard().await?;
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(2);
let shard = self.shard.unwrap();
let wal_stream_builder = WalReaderStreamBuilder {
tli: tli.wal_residence_guard().await?,
start_pos,
end_pos,
term,
end_watch,
wal_sender_guard: ws_guard.clone(),
};
if self.conf.wal_reader_fanout && !shard.is_unsharded() {
let ws_id = ws_guard.id();
ws_guard.walsenders().create_or_update_interpreted_reader(
ws_id,
start_pos,
self.conf.max_delta_for_fanout,
{
let tx = tx.clone();
|reader| {
tracing::info!(
"Fanning out interpreted wal reader at {}",
start_pos
);
reader
.fanout(shard, tx, start_pos)
.with_context(|| "Failed to fan out reader")
}
},
|| {
tracing::info!("Spawning interpreted wal reader at {}", start_pos);
let sender = InterpretedWalSender {
format,
compression,
pgb,
wal_stream_builder,
end_watch_view,
shard: self.shard.unwrap(),
pg_version,
appname,
};
let wal_stream = StreamingWalReader::new(
wal_residence_guard,
term,
start_pos,
end_pos,
end_watch,
MAX_SEND_SIZE,
);
InterpretedWalReader::spawn(
wal_stream, start_pos, tx, shard, pg_version, &appname,
)
},
)?;
let sender = InterpretedWalSender {
format,
compression,
appname,
tli: tli.wal_residence_guard().await?,
start_lsn: start_pos,
pgb,
end_watch_view,
wal_sender_guard: ws_guard.clone(),
rx,
};
FutureExt::boxed(sender.run())
} else {
let wal_reader = StreamingWalReader::new(
wal_residence_guard,
term,
start_pos,
end_pos,
end_watch,
MAX_SEND_SIZE,
);
let reader =
InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version);
let sender = InterpretedWalSender {
format,
compression,
appname: appname.clone(),
tli: tli.wal_residence_guard().await?,
start_lsn: start_pos,
pgb,
end_watch_view,
wal_sender_guard: ws_guard.clone(),
rx,
};
FutureExt::boxed(async move {
// Sender returns an Err on all code paths.
// If the sender finishes first, we will drop the reader future.
// If the reader finishes first, the sender will finish too since
// the wal sender has dropped.
let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname));
match res.map(|_| ()) {
Ok(_) => unreachable!("sender finishes with Err by convention"),
err_res => err_res,
}
})
}
Either::Right(sender.run())
}
};
@@ -679,8 +470,7 @@ impl SafekeeperPostgresHandler {
.clone();
info!(
"finished streaming to {}, feedback={:?}",
ws_state.get_addr(),
ws_state.get_feedback(),
ws_state.addr, ws_state.feedback,
);
// Join pg backend back.
@@ -788,18 +578,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
let metric = WAL_READERS
.get_metric_with_label_values(&[
"future",
self.appname.as_deref().unwrap_or("safekeeper"),
])
.unwrap();
metric.inc();
scopeguard::defer! {
metric.dec();
}
loop {
// Wait for the next portion if it is not there yet, or just
// update our end of WAL available for sending value, we
@@ -1035,7 +813,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
#[cfg(test)]
mod tests {
use safekeeper_api::models::FullTransactionId;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use utils::id::{TenantId, TimelineId};
use super::*;
@@ -1052,13 +830,13 @@ mod tests {
// add to wss specified feedback setting other fields to dummy values
fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState {
let walsender_state = WalSenderState {
ttid: mock_ttid(),
addr: mock_addr(),
conn_id: 1,
appname: None,
feedback,
});
};
wss.slots.push(Some(walsender_state))
}

View File

@@ -1,19 +1,13 @@
use std::sync::Arc;
use crate::rate_limit::RateLimiter;
use crate::receive_wal::WalAcceptor;
use crate::safekeeper::{
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
ProposerElected, SafeKeeper, TermHistory,
};
use crate::send_wal::EndWatch;
use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
use crate::state::{TimelinePersistentState, TimelineState};
use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
use crate::timelines_set::TimelinesSet;
use crate::wal_backup::remote_timeline_path;
use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf};
use crate::{control_file, wal_storage, SafeKeeperConf};
use camino_tempfile::Utf8TempDir;
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
use tokio::fs::create_dir_all;
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
@@ -113,59 +107,4 @@ impl Env {
);
Ok(timeline)
}
// This will be dead code when building a non-benchmark target with the
// benchmarking feature enabled.
#[allow(dead_code)]
pub(crate) async fn write_wal(
tli: Arc<Timeline>,
start_lsn: Lsn,
msg_size: usize,
msg_count: usize,
) -> anyhow::Result<EndWatch> {
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx());
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
let prefix = c"p";
let prefixlen = prefix.to_bytes_with_nul().len();
assert!(msg_size >= prefixlen);
let message = vec![0; msg_size - prefixlen];
let walgen =
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
for _ in 0..msg_count {
let (lsn, record) = walgen.next().unwrap();
let req = AppendRequest {
h: AppendRequestHeader {
term: 1,
term_start_lsn: start_lsn,
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: lsn,
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
};
let end_lsn = req.h.end_lsn;
let msg = ProposerAcceptorMessage::AppendRequest(req);
msg_tx.send(msg).await?;
while let Some(reply) = reply_rx.recv().await {
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
if resp.flush_lsn >= end_lsn {
break;
}
}
}
}
Ok(end_watch)
}
}

View File

@@ -35,7 +35,7 @@ use crate::control_file;
use crate::rate_limit::RateLimiter;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues};
use crate::send_wal::WalSenders;
use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState};
use crate::timeline_guard::ResidenceGuard;
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
@@ -712,22 +712,16 @@ impl Timeline {
return None;
}
let WalSendersTimelineMetricValues {
ps_feedback_counter,
last_ps_feedback,
interpreted_wal_reader_tasks,
} = self.walsenders.info_for_metrics();
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
let state = self.read_shared_state().await;
Some(FullTimelineInfo {
ttid: self.ttid,
ps_feedback_count: ps_feedback_counter,
ps_feedback_count,
last_ps_feedback,
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
interpreted_wal_reader_tasks,
epoch_start_lsn: state.sk.term_start_lsn(),
mem_state: state.sk.state().inmem.clone(),
persisted_state: TimelinePersistentState::clone(state.sk.state()),
@@ -746,7 +740,7 @@ impl Timeline {
debug_dump::Memory {
is_cancelled: self.is_cancelled(),
peers_info_len: state.peers_info.0.len(),
walsenders: self.walsenders.get_all_public(),
walsenders: self.walsenders.get_all(),
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,

View File

@@ -13,7 +13,6 @@ use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use safekeeper_api::membership::Configuration;
use safekeeper_api::models::SafekeeperUtilization;
use safekeeper_api::ServerInfo;
use serde::Serialize;
use std::collections::HashMap;
@@ -417,20 +416,6 @@ impl GlobalTimelines {
.collect()
}
/// Returns statistics about timeline counts
pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
let global_lock = self.state.lock().unwrap();
let timeline_count = global_lock
.timelines
.values()
.filter(|t| match t {
GlobalMapTimeline::CreationInProgress => false,
GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
})
.count() as u64;
SafekeeperUtilization { timeline_count }
}
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
/// and that's why it can return cancelled timelines, to retry deleting them.
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {

View File

@@ -1,16 +1,34 @@
use std::{
pin::Pin,
task::{Context, Poll},
};
use std::sync::Arc;
use async_stream::try_stream;
use bytes::Bytes;
use futures::{stream::BoxStream, Stream, StreamExt};
use futures::Stream;
use postgres_backend::CopyStreamHandlerEnd;
use safekeeper_api::Term;
use std::time::Duration;
use tokio::time::timeout;
use utils::lsn::Lsn;
use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader};
use safekeeper_api::Term;
use crate::{
send_wal::{EndWatch, WalSenderGuard},
timeline::WalResidentTimeline,
};
pub(crate) struct WalReaderStreamBuilder {
pub(crate) tli: WalResidentTimeline,
pub(crate) start_pos: Lsn,
pub(crate) end_pos: Lsn,
pub(crate) term: Option<Term>,
pub(crate) end_watch: EndWatch,
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
}
impl WalReaderStreamBuilder {
pub(crate) fn start_pos(&self) -> Lsn {
self.start_pos
}
}
#[derive(PartialEq, Eq, Debug)]
pub(crate) struct WalBytes {
/// Raw PG WAL
pub(crate) wal: Bytes,
@@ -26,270 +44,106 @@ pub(crate) struct WalBytes {
pub(crate) available_wal_end_lsn: Lsn,
}
struct PositionedWalReader {
start: Lsn,
end: Lsn,
reader: Option<WalReader>,
}
/// A streaming WAL reader wrapper which can be reset while running
pub(crate) struct StreamingWalReader {
stream: BoxStream<'static, WalOrReset>,
start_changed_tx: tokio::sync::watch::Sender<Lsn>,
}
pub(crate) enum WalOrReset {
Wal(anyhow::Result<WalBytes>),
Reset(Lsn),
}
impl WalOrReset {
pub(crate) fn get_wal(self) -> Option<anyhow::Result<WalBytes>> {
match self {
WalOrReset::Wal(wal) => Some(wal),
WalOrReset::Reset(_) => None,
}
}
}
impl StreamingWalReader {
pub(crate) fn new(
tli: WalResidentTimeline,
term: Option<Term>,
start: Lsn,
end: Lsn,
end_watch: EndWatch,
impl WalReaderStreamBuilder {
/// Builds a stream of Postgres WAL starting from [`Self::start_pos`].
/// The stream terminates when the receiver (pageserver) is fully caught up
/// and there's no active computes.
pub(crate) async fn build(
self,
buffer_size: usize,
) -> Self {
let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start);
let state = WalReaderStreamState {
) -> anyhow::Result<impl Stream<Item = Result<WalBytes, CopyStreamHandlerEnd>>> {
// TODO(vlad): The code below duplicates functionality from [`crate::send_wal`].
// We can make the raw WAL sender use this stream too and remove the duplication.
let Self {
tli,
wal_reader: PositionedWalReader {
start,
end,
reader: None,
},
mut start_pos,
mut end_pos,
term,
end_watch,
buffer: vec![0; buffer_size],
buffer_size,
};
mut end_watch,
wal_sender_guard,
} = self;
let mut wal_reader = tli.get_walreader(start_pos).await?;
let mut buffer = vec![0; buffer_size];
// When a change notification is received while polling the internal
// reader, stop polling the read future and service the change.
let stream = futures::stream::unfold(
(state, start_changed_rx),
|(mut state, mut rx)| async move {
let wal_or_reset = tokio::select! {
read_res = state.read() => { WalOrReset::Wal(read_res) },
changed_res = rx.changed() => {
if changed_res.is_err() {
return None;
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
Ok(try_stream! {
loop {
let have_something_to_send = end_pos > start_pos;
if !have_something_to_send {
// wait for lsn
let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await;
match res {
Ok(ok) => {
end_pos = ok?;
},
Err(_) => {
if let EndWatch::Commit(_) = end_watch {
if let Some(remote_consistent_lsn) = wal_sender_guard
.walsenders()
.get_ws_remote_consistent_lsn(wal_sender_guard.id())
{
if tli.should_walsender_stop(remote_consistent_lsn).await {
// Stop streaming if the receivers are caught up and
// there's no active compute. This causes the loop in
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
// to exit and terminate the WAL stream.
return;
}
}
}
continue;
}
let new_start_pos = rx.borrow_and_update();
WalOrReset::Reset(*new_start_pos)
}
}
assert!(
end_pos > start_pos,
"nothing to send after waiting for WAL"
);
// try to send as much as available, capped by the buffer size
let mut chunk_end_pos = start_pos + buffer_size as u64;
// if we went behind available WAL, back off
if chunk_end_pos >= end_pos {
chunk_end_pos = end_pos;
} else {
// If sending not up to end pos, round down to page boundary to
// avoid breaking WAL record not at page boundary, as protocol
// demands. See walsender.c (XLogSendPhysical).
chunk_end_pos = chunk_end_pos
.checked_sub(chunk_end_pos.block_offset())
.unwrap();
}
let send_size = (chunk_end_pos.0 - start_pos.0) as usize;
let buffer = &mut buffer[..send_size];
let send_size: usize;
{
// If uncommitted part is being pulled, check that the term is
// still the expected one.
let _term_guard = if let Some(t) = term {
Some(tli.acquire_term(t).await?)
} else {
None
};
// Read WAL into buffer. send_size can be additionally capped to
// segment boundary here.
send_size = wal_reader.read(buffer).await?
};
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
yield WalBytes {
wal,
wal_start_lsn: start_pos,
wal_end_lsn: start_pos + send_size as u64,
available_wal_end_lsn: end_pos
};
if let WalOrReset::Reset(lsn) = wal_or_reset {
state.wal_reader.start = lsn;
state.wal_reader.reader = None;
}
Some((wal_or_reset, (state, rx)))
},
)
.boxed();
Self {
stream,
start_changed_tx,
}
}
/// Reset the stream to a given position.
pub(crate) async fn reset(&mut self, start: Lsn) {
self.start_changed_tx.send(start).unwrap();
while let Some(wal_or_reset) = self.stream.next().await {
match wal_or_reset {
WalOrReset::Reset(at) => {
// Stream confirmed the reset.
// There may only one ongoing reset at any given time,
// hence the assertion.
assert_eq!(at, start);
break;
}
WalOrReset::Wal(_) => {
// Ignore wal generated before reset was handled
}
start_pos += send_size as u64;
}
}
}
}
impl Stream for StreamingWalReader {
type Item = WalOrReset;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
Pin::new(&mut self.stream).poll_next(cx)
}
}
struct WalReaderStreamState {
tli: WalResidentTimeline,
wal_reader: PositionedWalReader,
term: Option<Term>,
end_watch: EndWatch,
buffer: Vec<u8>,
buffer_size: usize,
}
impl WalReaderStreamState {
async fn read(&mut self) -> anyhow::Result<WalBytes> {
// Create reader if needed
if self.wal_reader.reader.is_none() {
self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?);
}
let have_something_to_send = self.wal_reader.end > self.wal_reader.start;
if !have_something_to_send {
tracing::debug!(
"Waiting for wal: start={}, end={}",
self.wal_reader.end,
self.wal_reader.start
);
self.wal_reader.end = self
.end_watch
.wait_for_lsn(self.wal_reader.start, self.term)
.await?;
tracing::debug!(
"Done waiting for wal: start={}, end={}",
self.wal_reader.end,
self.wal_reader.start
);
}
assert!(
self.wal_reader.end > self.wal_reader.start,
"nothing to send after waiting for WAL"
);
// Calculate chunk size
let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64;
if chunk_end_pos >= self.wal_reader.end {
chunk_end_pos = self.wal_reader.end;
} else {
chunk_end_pos = chunk_end_pos
.checked_sub(chunk_end_pos.block_offset())
.unwrap();
}
let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize;
let buffer = &mut self.buffer[..send_size];
// Read WAL
let send_size = {
let _term_guard = if let Some(t) = self.term {
Some(self.tli.acquire_term(t).await?)
} else {
None
};
self.wal_reader
.reader
.as_mut()
.unwrap()
.read(buffer)
.await?
};
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
let result = WalBytes {
wal,
wal_start_lsn: self.wal_reader.start,
wal_end_lsn: self.wal_reader.start + send_size as u64,
available_wal_end_lsn: self.wal_reader.end,
};
self.wal_reader.start += send_size as u64;
Ok(result)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use futures::StreamExt;
use postgres_ffi::MAX_SEND_SIZE;
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
};
use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader};
#[tokio::test]
async fn test_streaming_wal_reader_reset() {
let _ = env_logger::builder().is_test(true).try_init();
const SIZE: usize = 8 * 1024;
const MSG_COUNT: usize = 200;
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
let env = Env::new(true).unwrap();
let tli = env
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
.await
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();
tracing::info!("Doing first round of reads ...");
let mut streaming_wal_reader = StreamingWalReader::new(
resident_tli,
None,
start_lsn,
end_pos,
end_watch,
MAX_SEND_SIZE,
);
let mut before_reset = Vec::new();
while let Some(wor) = streaming_wal_reader.next().await {
let wal = wor.get_wal().unwrap().unwrap();
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
before_reset.push(wal);
if stop {
break;
}
}
tracing::info!("Resetting the WAL stream ...");
streaming_wal_reader.reset(start_lsn).await;
tracing::info!("Doing second round of reads ...");
let mut after_reset = Vec::new();
while let Some(wor) = streaming_wal_reader.next().await {
let wal = wor.get_wal().unwrap().unwrap();
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
after_reset.push(wal);
if stop {
break;
}
}
assert_eq!(before_reset, after_reset);
})
}
}

View File

@@ -180,8 +180,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
control_file_save_interval: Duration::from_secs(1),
partial_backup_concurrency: 1,
eviction_min_resident: Duration::ZERO,
wal_reader_fanout: false,
max_delta_for_fanout: None,
};
let mut global = GlobalMap::new(disk, conf.clone())?;

View File

@@ -1,2 +0,0 @@
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled';
UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause';

View File

@@ -1,2 +0,0 @@
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause';
UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled';

View File

@@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::{
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest,
ShardsPreferredAzsRequest, TenantCreateRequest,
};
use pageserver_api::models::{
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
@@ -1305,35 +1305,6 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
.unwrap())
}
/// Sets the scheduling policy of the specified safekeeper
async fn handle_safekeeper_scheduling_policy(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let body = json_request::<SafekeeperSchedulingPolicyRequest>(&mut req).await?;
let id = parse_request_param::<i64>(&req, "id")?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let state = get_state(&req);
state
.service
.set_safekeeper_scheduling_policy(id, body.scheduling_policy)
.await?;
Ok(Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::empty())
.unwrap())
}
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(
@@ -1902,18 +1873,7 @@ pub fn make_router(
})
.post("/control/v1/safekeeper/:id", |r| {
// id is in the body
named_request_span(
r,
handle_upsert_safekeeper,
RequestName("v1_safekeeper_post"),
)
})
.post("/control/v1/safekeeper/:id/scheduling_policy", |r| {
named_request_span(
r,
handle_safekeeper_scheduling_policy,
RequestName("v1_safekeeper_status"),
)
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {

View File

@@ -1104,37 +1104,6 @@ impl Persistence {
})
.await
}
pub(crate) async fn set_safekeeper_scheduling_policy(
&self,
id_: i64,
scheduling_policy_: SkSchedulingPolicy,
) -> Result<(), DatabaseError> {
use crate::schema::safekeepers::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
#[derive(Insertable, AsChangeset)]
#[diesel(table_name = crate::schema::safekeepers)]
struct UpdateSkSchedulingPolicy<'a> {
id: i64,
scheduling_policy: &'a str,
}
let scheduling_policy_ = String::from(scheduling_policy_);
let rows_affected = diesel::update(safekeepers.filter(id.eq(id_)))
.set(scheduling_policy.eq(scheduling_policy_))
.execute(conn)?;
if rows_affected != 1 {
return Err(DatabaseError::Logical(format!(
"unexpected number of rows ({rows_affected})",
)));
}
Ok(())
})
.await
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably

View File

@@ -47,7 +47,7 @@ use pageserver_api::{
AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
@@ -5411,15 +5411,6 @@ impl Service {
expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
// Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig`
// definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new
// fields are added, before doing a comparison.
for tsp in &mut persistent_shards {
let config: TenantConfig = serde_json::from_str(&tsp.config)
.map_err(|e| ApiError::InternalServerError(e.into()))?;
tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible");
}
if persistent_shards != expect_shards {
tracing::error!("Consistency check failed on shards.");
@@ -7279,14 +7270,19 @@ impl Service {
Ok(())
}
/// Create a node fill plan (pick secondaries to promote), based on:
/// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
/// outside their home AZ, should be migrated back here.
/// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of
/// attached shards, we will promote more shards from the nodes with the most attached shards, unless
/// those shards have a home AZ that doesn't match the node we're filling.
/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
/// 1. The node should be filled until it reaches the expected cluster average of
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
/// throughout the cluster. We achieve this by picking tenant shards from each node,
/// starting from the ones with the largest number of attached shards, until the node
/// reaches the expected cluster average.
/// 3. Avoid promoting more shards of the same tenant than required. The upper bound
/// for the number of tenants from the same shard promoted to the node being filled is:
/// shard count for the tenant divided by the number of nodes in the cluster.
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
let mut locked = self.inner.write().unwrap();
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
let (nodes, tenants, _scheduler) = locked.parts_mut();
let node_az = nodes
@@ -7295,79 +7291,53 @@ impl Service {
.get_availability_zone_id()
.clone();
// The tenant shard IDs that we plan to promote from secondary to attached on this node
let mut plan = Vec::new();
let mut tids_by_node = tenants
.iter_mut()
.filter_map(|(tid, tenant_shard)| {
if !matches!(
tenant_shard.get_scheduling_policy(),
ShardSchedulingPolicy::Active
) {
// Only include tenants in fills if they have a normal (Active) scheduling policy. We
// even exclude Essential, because moving to fill a node is not essential to keeping this
// tenant available.
return None;
}
// Collect shards which do not have a preferred AZ & are elegible for moving in stage 2
let mut free_tids_by_node: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
// Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could
// conceivably come up in real life if deploying a single-AZ region intentionally.
let respect_azs = nodes
.values()
.map(|n| n.get_availability_zone_id())
.unique()
.count()
> 1;
// Step 1: collect all shards that we are required to migrate back to this node because their AZ preference
// requires it.
for (tsid, tenant_shard) in tenants {
if !tenant_shard.intent.get_secondary().contains(&node_id) {
// Shard doesn't have a secondary on this node, ignore it.
continue;
}
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
// shards which belong on this node, not to promote shards whose scheduling preference
// would be on their currently attached node. So will avoid promoting shards whose
// home AZ doesn't match the AZ of the node we're filling.
match tenant_shard.preferred_az() {
_ if !respect_azs => {
if let Some(primary) = tenant_shard.intent.get_attached() {
free_tids_by_node.entry(*primary).or_default().push(*tsid);
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
// shards which belong on this node, not to promote shards whose scheduling preference
// would be on their currently attached node. So will avoid promoting shards whose
// home AZ doesn't match the AZ of the node we're filling.
match tenant_shard.preferred_az() {
None => {
// Shard doesn't have an AZ preference: it is elegible to be moved.
}
Some(az) if az == &node_az => {
// This shard's home AZ is equal to the node we're filling: it is
// elegible to be moved: fall through;
}
Some(_) => {
// This shard's home AZ is somewhere other than the node we're filling:
// do not include it in the fill plan.
return None;
}
}
None => {
// Shard doesn't have an AZ preference: it is elegible to be moved, but we
// will only do so if our target shard count requires it.
if let Some(primary) = tenant_shard.intent.get_attached() {
free_tids_by_node.entry(*primary).or_default().push(*tsid);
}
}
Some(az) if az == &node_az => {
// This shard's home AZ is equal to the node we're filling: it should
// be moved back to this node as part of filling, unless its currently
// attached location is also in its home AZ.
if let Some(primary) = tenant_shard.intent.get_attached() {
if nodes
.get(primary)
.expect("referenced node must exist")
.get_availability_zone_id()
!= tenant_shard
.preferred_az()
.expect("tenant must have an AZ preference")
{
plan.push(*tsid)
}
} else {
plan.push(*tsid)
}
}
Some(_) => {
// This shard's home AZ is somewhere other than the node we're filling,
// it may not be moved back to this node as part of filling. Ignore it
}
}
}
// Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
if tenant_shard.intent.get_secondary().contains(&node_id) {
if let Some(primary) = tenant_shard.intent.get_attached() {
return Some((*primary, *tid));
}
}
None
})
.into_group_map();
let expected_attached = locked.scheduler.expected_attached_shard_count();
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
let mut plan = Vec::new();
for (node_id, attached) in nodes_by_load {
let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available());
@@ -7376,7 +7346,7 @@ impl Service {
}
if plan.len() >= fill_requirement
|| free_tids_by_node.is_empty()
|| tids_by_node.is_empty()
|| attached <= expected_attached
{
break;
@@ -7388,7 +7358,7 @@ impl Service {
let mut remove_node = false;
while take > 0 {
match free_tids_by_node.get_mut(&node_id) {
match tids_by_node.get_mut(&node_id) {
Some(tids) => match tids.pop() {
Some(tid) => {
let max_promote_for_tenant = std::cmp::max(
@@ -7414,7 +7384,7 @@ impl Service {
}
if remove_node {
free_tids_by_node.remove(&node_id);
tids_by_node.remove(&node_id);
}
}
@@ -7681,16 +7651,6 @@ impl Service {
self.persistence.safekeeper_upsert(record).await
}
pub(crate) async fn set_safekeeper_scheduling_policy(
&self,
id: i64,
scheduling_policy: SkSchedulingPolicy,
) -> Result<(), DatabaseError> {
self.persistence
.set_safekeeper_scheduling_policy(id, scheduling_policy)
.await
}
pub(crate) async fn update_shards_preferred_azs(
&self,
req: ShardsPreferredAzsRequest,

View File

@@ -1,17 +1,11 @@
use std::{
collections::{BTreeMap, HashMap},
sync::Arc,
time::Duration,
};
use std::{sync::Arc, time::Duration};
use pageserver_api::controller_api::ShardSchedulingPolicy;
use rand::seq::SliceRandom;
use rand::thread_rng;
use tokio_util::sync::CancellationToken;
use utils::id::NodeId;
use utils::shard::TenantShardId;
use super::{Node, Scheduler, Service, TenantShard};
use super::Service;
pub struct ChaosInjector {
service: Arc<Service>,
@@ -41,86 +35,50 @@ impl ChaosInjector {
}
}
/// If a shard has a secondary and attached location, then re-assign the secondary to be
/// attached and the attached to be secondary.
///
/// Only modifies tenants if they're in Active scheduling policy.
fn maybe_migrate_to_secondary(
&self,
tenant_shard_id: TenantShardId,
nodes: &Arc<HashMap<NodeId, Node>>,
tenants: &mut BTreeMap<TenantShardId, TenantShard>,
scheduler: &mut Scheduler,
) {
let shard = tenants
.get_mut(&tenant_shard_id)
.expect("Held lock between choosing ID and this get");
if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
// Skip non-active scheduling policies, so that a shard with a policy like Pause can
// be pinned without being disrupted by us.
tracing::info!(
"Skipping shard {tenant_shard_id}: scheduling policy is {:?}",
shard.get_scheduling_policy()
);
return;
}
// Pick a secondary to promote
let Some(new_location) = shard
.intent
.get_secondary()
.choose(&mut thread_rng())
.cloned()
else {
tracing::info!(
"Skipping shard {tenant_shard_id}: no secondary location, can't migrate"
);
return;
};
let Some(old_location) = *shard.intent.get_attached() else {
tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location");
return;
};
tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}");
shard.intent.demote_attached(scheduler, old_location);
shard.intent.promote_attached(scheduler, new_location);
self.service.maybe_reconcile_shard(shard, nodes);
}
async fn inject_chaos(&mut self) {
// Pick some shards to interfere with
let batch_size = 128;
let mut inner = self.service.inner.write().unwrap();
let (nodes, tenants, scheduler) = inner.parts_mut();
let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
// Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector
// continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some
// random tenants to move, and then on next chaos iteration moving them back, then picking some new
// random tenants on the next iteration.
let mut victims = Vec::with_capacity(batch_size);
for shard in tenants.values() {
if shard.is_attached_outside_preferred_az(nodes) {
victims.push(shard.tenant_shard_id);
}
if victims.len() >= batch_size {
break;
}
}
let choose_random = batch_size.saturating_sub(victims.len());
tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());
let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
victims.extend(random_victims);
let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
for victim in victims {
self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
let shard = tenants
.get_mut(victim)
.expect("Held lock between choosing ID and this get");
if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
// Skip non-active scheduling policies, so that a shard with a policy like Pause can
// be pinned without being disrupted by us.
tracing::info!(
"Skipping shard {victim}: scheduling policy is {:?}",
shard.get_scheduling_policy()
);
continue;
}
// Pick a secondary to promote
let Some(new_location) = shard
.intent
.get_secondary()
.choose(&mut thread_rng())
.cloned()
else {
tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
continue;
};
let Some(old_location) = *shard.intent.get_attached() else {
tracing::info!("Skipping shard {victim}: currently has no attached location");
continue;
};
tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}");
shard.intent.demote_attached(scheduler, old_location);
shard.intent.promote_attached(scheduler, new_location);
self.service.maybe_reconcile_shard(shard, nodes);
}
}
}

View File

@@ -1793,23 +1793,6 @@ impl TenantShard {
}
}
}
/// Returns true if the tenant shard is attached to a node that is outside the preferred AZ.
///
/// If the shard does not have a preferred AZ, returns false.
pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap<NodeId, Node>) -> bool {
self.intent
.get_attached()
.map(|node_id| {
Some(
nodes
.get(&node_id)
.expect("referenced node exists")
.get_availability_zone_id(),
) == self.intent.preferred_az_id.as_ref()
})
.unwrap_or(false)
}
}
impl Drop for TenantShard {

View File

@@ -370,7 +370,6 @@ class NeonEnvBuilder:
pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None,
num_safekeepers: int = 1,
num_pageservers: int = 1,
num_azs: int = 1,
# Use non-standard SK ids to check for various parsing bugs
safekeepers_id_start: int = 0,
# fsync is disabled by default to make the tests go faster
@@ -402,7 +401,6 @@ class NeonEnvBuilder:
self.pageserver_config_override = pageserver_config_override
self.num_safekeepers = num_safekeepers
self.num_pageservers = num_pageservers
self.num_azs = num_azs
self.safekeepers_id_start = safekeepers_id_start
self.safekeepers_enable_fsync = safekeepers_enable_fsync
self.auth_enabled = auth_enabled
@@ -992,7 +990,6 @@ class NeonEnv:
self.endpoints = EndpointFactory(self)
self.safekeepers: list[Safekeeper] = []
self.pageservers: list[NeonPageserver] = []
self.num_azs = config.num_azs
self.broker = NeonBroker(self)
self.pageserver_remote_storage = config.pageserver_remote_storage
self.safekeepers_remote_storage = config.safekeepers_remote_storage
@@ -1093,21 +1090,14 @@ class NeonEnv:
http=self.port_distributor.get_port(),
)
# Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override`
if self.num_azs > 1:
# Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc.
az_prefix = DEFAULT_AZ_ID[:-1]
availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}"
else:
availability_zone = DEFAULT_AZ_ID
ps_cfg: dict[str, Any] = {
"id": ps_id,
"listen_pg_addr": f"localhost:{pageserver_port.pg}",
"listen_http_addr": f"localhost:{pageserver_port.http}",
"pg_auth_type": pg_auth_type,
"http_auth_type": http_auth_type,
"availability_zone": availability_zone,
# Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
"availability_zone": DEFAULT_AZ_ID,
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
"no_sync": True,
@@ -2346,14 +2336,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
json=body,
)
def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str):
self.request(
"POST",
f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy",
headers=self.headers(TokenScope.ADMIN),
json={"id": id, "scheduling_policy": scheduling_policy},
)
def get_safekeeper(self, id: int) -> dict[str, Any] | None:
try:
response = self.request(
@@ -4153,7 +4135,7 @@ class Endpoint(PgProtocol, LogUtils):
# Checkpoints running endpoint and returns pg_wal size in MB.
def get_pg_wal_size(self):
log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}")
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
self.safe_psql("checkpoint")
assert self.pgdata_dir is not None # please mypy
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
@@ -4993,7 +4975,7 @@ def logical_replication_sync(
if res:
log.info(f"subscriber_lsn={res}")
subscriber_lsn = Lsn(res)
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
if subscriber_lsn >= publisher_lsn:
return subscriber_lsn
time.sleep(0.5)

View File

@@ -53,22 +53,6 @@ class Workload:
self._endpoint: Endpoint | None = None
self._endpoint_opts = endpoint_opts or {}
def branch(
self,
timeline_id: TimelineId,
branch_name: str | None = None,
endpoint_opts: dict[str, Any] | None = None,
) -> Workload:
"""
Checkpoint the current status of the workload in case of branching
"""
branch_workload = Workload(
self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts
)
branch_workload.expect_rows = self.expect_rows
branch_workload.churn_cursor = self.churn_cursor
return branch_workload
def reconfigure(self) -> None:
"""
Request the endpoint to reconfigure based on location reported by storage controller

View File

@@ -112,11 +112,7 @@ page_cache_size=10
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"with_branches",
["with_branches", "no_branches"],
)
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str):
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
SMOKE_CONF = {
# Run both gc and gc-compaction.
"gc_period": "5s",
@@ -147,17 +143,12 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)
child_workloads: list[Workload] = []
for i in range(1, churn_rounds + 1):
if i % 10 == 0:
log.info(f"Running churn round {i}/{churn_rounds} ...")
if i % 10 == 5 and with_branches == "with_branches":
branch_name = f"child-{i}"
branch_timeline_id = env.create_branch(branch_name)
child_workloads.append(workload.branch(branch_timeline_id, branch_name))
if (i - 1) % 10 == 0 or (i - 1) % 10 == 1:
# Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long time.
if (i - 1) % 10 == 0:
# Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
ps_http.timeline_compact(
tenant_id,
timeline_id,
@@ -188,9 +179,6 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
for child_workload in child_workloads:
log.info(f"Validating at branch {child_workload.branch_name}")
child_workload.validate(env.pageserver.id)
# Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)

View File

@@ -7,78 +7,9 @@ import threading
import time
import pytest
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import USE_LFC, query_scalar
"""
Test whether LFC doesn't error out when the LRU is empty, but the LFC is
already at its maximum size.
If we don't handle this safely, we might allocate more hash entries than
otherwise considered safe, thus causing ERRORs in hash_search(HASH_ENTER) once
we hit lfc->used >= lfc->limit.
"""
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_local_file_cache_all_pinned(neon_simple_env: NeonEnv):
env = neon_simple_env
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"neon.max_file_cache_size='1MB'",
"neon.file_cache_size_limit='1MB'",
],
)
top_cur = endpoint.connect().cursor()
stop = threading.Event()
n_rows = 10000
n_threads = 5
n_updates_per_connection = 1000
top_cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
top_cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
# Start threads that will perform random UPDATEs. Each UPDATE
# increments the counter on the row, so that we can check at the
# end that the sum of all the counters match the number of updates
# performed (plus the initial 1 on each row).
#
# Furthermore, each thread will reconnect between every 1000 updates.
def run_updates(n_updates_performed_q: queue.Queue[int]):
n_updates_performed = 0
conn = endpoint.connect()
cur = conn.cursor()
while not stop.is_set():
id = random.randint(1, n_rows)
cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
n_updates_performed += 1
if n_updates_performed % n_updates_per_connection == 0:
cur.close()
conn.close()
conn = endpoint.connect()
cur = conn.cursor()
n_updates_performed_q.put(n_updates_performed)
n_updates_performed_q: queue.Queue[int] = queue.Queue()
threads: list[threading.Thread] = []
for _i in range(n_threads):
thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
thread.start()
threads.append(thread)
time.sleep(15)
stop.set()
n_updates_performed = 0
for thread in threads:
thread.join()
n_updates_performed += n_updates_performed_q.get()
assert query_scalar(top_cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):

View File

@@ -4,19 +4,9 @@ import time
from fixtures.neon_fixtures import NeonEnv
BTREE_NUM_CYCLEID_PAGES = """
WITH lsns AS (
/*
* pg_switch_wal() ensures we have an LSN that
* 1. is after any previous modifications, but also,
* 2. (critically) is flushed, preventing any issues with waiting for
* unflushed WAL in PageServer.
*/
SELECT pg_switch_wal() as lsn
),
raw_pages AS (
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno,
lsns l(lsn)
WITH raw_pages AS (
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
),
parsed_pages AS (
/* cycle ID is the last 2 bytes of the btree page */
@@ -46,6 +36,7 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
ses1.execute("SELECT neon_xlogflush();")
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert (
@@ -66,6 +57,7 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
ses1.execute("DELETE FROM t WHERE id <= 610;")
# Flush wal, for checking purposes
ses1.execute("SELECT neon_xlogflush();")
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
@@ -116,6 +108,8 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
# unpin the btree page, allowing s3's vacuum to complete
ses2.execute("FETCH ALL FROM foo;")
ses2.execute("ROLLBACK;")
# flush WAL to make sure PS is up-to-date
ses1.execute("SELECT neon_xlogflush();")
# check that our expectations are correct
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()

View File

@@ -6,9 +6,14 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.pageserver.http import PageserverHttpClient
def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
def check_tenant(
env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int
):
tenant_id, timeline_id = env.create_tenant()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
# we rely upon autocommit after each statement
res_1 = endpoint.safe_psql_many(
queries=[
@@ -33,7 +38,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)])
def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int):
# Test both proto versions until we fully migrate.
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_normal_work(
neon_env_builder: NeonEnvBuilder,
num_timelines: int,
num_safekeepers: int,
safekeeper_proto_version: int,
):
"""
Basic test:
* create new tenant with a timeline
@@ -52,4 +64,4 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s
pageserver_http = env.pageserver.http_client()
for _ in range(num_timelines):
check_tenant(env, pageserver_http)
check_tenant(env, pageserver_http, safekeeper_proto_version)

View File

@@ -1,60 +0,0 @@
# NB: there are benchmarks that double-serve as tests inside the `performance` directory.
import subprocess
from pathlib import Path
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct
@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"])
def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str):
def patch_pageserver_toml(config):
config["page_service_pipelining"] = {
"mode": "pipelined",
"max_batch_size": 32,
"execution": "concurrent-futures",
}
neon_env_builder.pageserver_config_override = patch_pageserver_toml
env = neon_env_builder.init_start()
log.info("make flush appear slow")
log.info("sending requests until pageserver accepts no more")
# TODO: extract this into a helper, like subprocess_capture,
# so that we capture the stderr from the helper somewhere.
child = subprocess.Popen(
[
neon_binpath / "test_helper_slow_client_reads",
env.pageserver.connstr(),
str(env.initial_tenant),
str(env.initial_timeline),
],
bufsize=0, # unbuffered
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
)
assert child.stdout is not None
buf = child.stdout.read(1)
if len(buf) != 1:
raise Exception("unexpected EOF")
if buf != b"R":
raise Exception(f"unexpected data: {buf!r}")
log.info("helper reports pageserver accepts no more requests")
log.info(
"assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace"
)
if kind == "pageserver-stop":
log.info("try to shut down the pageserver cleanly")
env.pageserver.stop()
elif kind == "tenant-detach":
log.info("try to shut down the tenant")
env.pageserver.tenant_detach(env.initial_tenant)
else:
raise ValueError(f"unexpected kind: {kind}")
log.info("shutdown did not time out, test passed")

View File

@@ -2394,7 +2394,6 @@ def test_storage_controller_node_deletion(
Test that deleting a node works & properly reschedules everything that was on the node.
"""
neon_env_builder.num_pageservers = 3
neon_env_builder.num_azs = 3
env = neon_env_builder.init_configs()
env.start()
@@ -2408,9 +2407,6 @@ def test_storage_controller_node_deletion(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
# Sanity check: initial creations should not leave the system in an unstable scheduling state
assert env.storage_controller.reconcile_all() == 0
victim = env.pageservers[-1]
# The procedure a human would follow is:
@@ -3212,17 +3208,6 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
assert eq_safekeeper_records(body, inserted_now)
# some small tests for the scheduling policy querying and returning APIs
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Decomissioned"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
compared = [dict(a), dict(b)]

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
import pytest
@@ -254,8 +253,29 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))
def timeline_create():
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
raise RuntimeError("creation succeeded even though it shouldn't")
try:
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
raise RuntimeError("creation succeeded even though it shouldn't")
except ReadTimeout:
pass
Thread(target=timeline_create).start()
def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
wait_until(hit_initdb_upload_failpoint)
def creation_connection_timed_out():
env.pageserver.assert_log_contains(
"POST.*/timeline.* request was dropped before completing"
)
# Wait so that we hit the timeout and the connection is dropped
# (But timeline creation still continues)
wait_until(creation_connection_timed_out)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
def tenant_delete():
def tenant_delete_inner():
@@ -263,46 +283,21 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
wait_until(tenant_delete_inner)
# We will spawn background threads for timeline creation and tenant deletion. They will both
# get blocked on our failpoint.
with ThreadPoolExecutor(max_workers=1) as executor:
create_fut = executor.submit(timeline_create)
Thread(target=tenant_delete).start()
def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
def deletion_arrived():
env.pageserver.assert_log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
wait_until(hit_initdb_upload_failpoint)
wait_until(deletion_arrived)
def creation_connection_timed_out():
env.pageserver.assert_log_contains(
"POST.*/timeline.* request was dropped before completing"
)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
# Wait so that we hit the timeout and the connection is dropped
# (But timeline creation still continues)
wait_until(creation_connection_timed_out)
# Disable the failpoint and wait for deletion to finish
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
with pytest.raises(ReadTimeout):
# Our creation failed from the client's point of view.
create_fut.result()
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
delete_fut = executor.submit(tenant_delete)
def deletion_arrived():
env.pageserver.assert_log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
wait_until(deletion_arrived)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
# Disable the failpoint and wait for deletion to finish
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
delete_fut.result()
ps_http.tenant_delete(tenant_id)
# Physical deletion should have happened
assert_prefix_empty(

View File

@@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
io_metrics = query_all_safekeepers(
"safekeeper_pg_io_bytes_total",
{
"app_name": f"pageserver-{env.pageserver.id}",
"app_name": "pageserver",
"client_az": "test_ps_az",
"dir": io_direction,
"same_az": "false",

View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.2",
"a8dd6e779dde907778006adb436b557ad652fb97"
"0f8da73ed08d4fc4ee58cccea008c75bfb20baa8"
],
"v16": [
"16.6",
"d674efd776f59d78e8fa1535bd2f95c3e6984fca"
"f63b141cfb0c813725a6b2574049565bff643018"
],
"v15": [
"15.10",
"dd0b28d6fbad39e227f3b77296fcca879af8b3a9"
"d3141e17a7155e3d07c8deba4a10c748a29ba1e6"
],
"v14": [
"14.15",
"46082f20884f087a2d974b33ac65d63af26142bd"
"210a0ba3afd8134ea910b203f274b165bd4f05d7"
]
}