Mirror of https://github.com/neondatabase/neon.git (synced 2026-05-16 04:30:38 +00:00)

Compare commits: 19 commits, sk-members...jcsp/frida
Commits in this compare:
fec5ac5838, da13154791, 2e13a3aa7a, cccc196848, e436dcad57, 21d7b6a258, 86dbc44db1,
58f6af6c9a, 7be971081a, 6fe4c6798f, 2eda484ef6, c7429af8a0, a753349cb0, 55a68b28a2,
fb0e2acb2f, efaec6cdf8, 3d41069dc4, dbebede7bf, 3e529f124f
.github/workflows/_check-codestyle-rust.yml (vendored, new file, 91 lines)
@@ -0,0 +1,91 @@
name: Check Codestyle Rust

on:
  workflow_call:
    inputs:
      build-tools-image:
        description: "build-tools image"
        required: true
        type: string
      archs:
        description: "Json array of architectures to run on"
        type: string

defaults:
  run:
    shell: bash -euxo pipefail {0}

jobs:
  check-codestyle-rust:
    strategy:
      matrix:
        arch: ${{ fromJson(inputs.archs) }}
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Cache cargo deps
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
            ~/.cargo/git
            target
          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust

      # Some of our rust modules use FFI and need those to be checked
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
      #
      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
      # time just for that, so skip "clippy --release".
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
            echo "No clippy args found in .neon_clippy_args"
            exit 1
          fi
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS

      - name: Check documentation generation
        run: cargo doc --workspace --no-deps --document-private-items
        env:
          RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"

      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
        run: cargo fmt --all -- --check

      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
      - name: Check rust dependencies
        if: ${{ !cancelled() }}
        run: |
          cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack

      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
        run: cargo deny check --hide-inclusion-graph
.github/workflows/build_and_test.yml (vendored, 76 lines changed)
@@ -164,77 +164,11 @@ jobs:

  check-codestyle-rust:
    needs: [ check-permissions, build-build-tools-image ]
    strategy:
      matrix:
        arch: [ x64, arm64 ]
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Cache cargo deps
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
            ~/.cargo/git
            target
          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust

      # Some of our rust modules use FFI and need those to be checked
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
      #
      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
      # time just for that, so skip "clippy --release".
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
            echo "No clippy args found in .neon_clippy_args"
            exit 1
          fi
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS

      - name: Check documentation generation
        run: cargo doc --workspace --no-deps --document-private-items
        env:
          RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"

      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
        run: cargo fmt --all -- --check

      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
      - name: Check rust dependencies
        if: ${{ !cancelled() }}
        run: |
          cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack

      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
        run: cargo deny check --hide-inclusion-graph
    uses: ./.github/workflows/_check-codestyle-rust.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      archs: '["x64", "arm64"]'
    secrets: inherit

  build-and-test-locally:
    needs: [ tag, build-build-tools-image ]
.github/workflows/pre-merge-checks.yml (vendored, 33 lines changed)
@@ -1,6 +1,12 @@
name: Pre-merge checks

on:
  pull_request:
    paths:
      - .github/workflows/_check-codestyle-python.yml
      - .github/workflows/_check-codestyle-rust.yml
      - .github/workflows/build-build-tools-image.yml
      - .github/workflows/pre-merge-checks.yml
  merge_group:
    branches:
      - main
@@ -17,8 +23,10 @@ jobs:
    runs-on: ubuntu-22.04
    outputs:
      python-changed: ${{ steps.python-src.outputs.any_changed }}
      rust-changed: ${{ steps.rust-src.outputs.any_changed }}
    steps:
      - uses: actions/checkout@v4

      - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
        id: python-src
        with:
@@ -30,11 +38,25 @@
            poetry.lock
            pyproject.toml

      - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
        id: rust-src
        with:
          files: |
            .github/workflows/_check-codestyle-rust.yml
            .github/workflows/build-build-tools-image.yml
            .github/workflows/pre-merge-checks.yml
            **/**.rs
            **/Cargo.toml
            Cargo.toml
            Cargo.lock

      - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
        env:
          PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
          RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
        run: |
          echo "${PYTHON_CHANGED_FILES}"
          echo "${RUST_CHANGED_FILES}"

  build-build-tools-image:
    if: needs.get-changed-files.outputs.python-changed == 'true'
@@ -55,6 +77,16 @@
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
    secrets: inherit

  check-codestyle-rust:
    if: needs.get-changed-files.outputs.rust-changed == 'true'
    needs: [ get-changed-files, build-build-tools-image ]
    uses: ./.github/workflows/_check-codestyle-rust.yml
    with:
      # `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
      archs: '["x64"]'
    secrets: inherit

  # To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
  # Currently we require 2 jobs (checks with exact name):
  # - conclusion
@@ -67,6 +99,7 @@
    needs:
      - get-changed-files
      - check-codestyle-python
      - check-codestyle-rust
    runs-on: ubuntu-22.04
    steps:
      - name: Create fake `neon-cloud-e2e` check
Cargo.lock (generated, 6 lines changed)
@@ -3981,9 +3981,11 @@ name = "pagectl"
version = "0.1.0"
dependencies = [
 "anyhow",
 "bincode",
 "camino",
 "clap",
 "humantime",
 "itertools 0.10.5",
 "pageserver",
 "pageserver_api",
 "postgres_ffi",
@@ -4005,6 +4007,7 @@ dependencies = [
 "arc-swap",
 "async-compression",
 "async-stream",
 "bincode",
 "bit_field",
 "byteorder",
 "bytes",
@@ -5655,6 +5658,7 @@ dependencies = [
 "crc32c",
 "criterion",
 "desim",
 "env_logger 0.10.2",
 "fail",
 "futures",
 "hex",
@@ -5683,6 +5687,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
 "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -5709,6 +5714,7 @@ version = "0.1.0"
dependencies = [
 "anyhow",
 "const_format",
 "pageserver_api",
 "postgres_ffi",
 "pq_proto",
 "serde",
@@ -66,6 +66,7 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
@@ -871,7 +872,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.12.6 cargo-pgrx && \
|
||||
cargo install --locked --version 0.12.9 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
@@ -908,19 +909,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
|
||||
\
|
||||
cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
|
||||
\
|
||||
cd ../rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
|
||||
\
|
||||
cd ../rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
@@ -945,7 +946,8 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
|
||||
# against postgres forks that decided to change their ABI name (like us).
|
||||
# With that we can build extensions without forking them and using stock
|
||||
# pgx. As this feature is new few manual version bumps were required.
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
@@ -963,7 +965,8 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
@@ -984,9 +987,8 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
# TODO update pgrx version in the pg_tiktoken repo and remove this line
|
||||
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
@@ -1028,7 +1030,11 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
cargo pgrx install --release
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Parser;
|
||||
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
|
||||
use nix::unistd::Pid;
|
||||
use tracing::{info, info_span, warn, Instrument};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::fs_ext::is_directory_empty;
|
||||
|
||||
#[path = "fast_import/aws_s3_sync.rs"]
|
||||
@@ -41,12 +41,19 @@ mod child_stdio_to_log;
|
||||
#[path = "fast_import/s3_uri.rs"]
|
||||
mod s3_uri;
|
||||
|
||||
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
|
||||
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
#[clap(long)]
|
||||
working_directory: Utf8PathBuf,
|
||||
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
|
||||
s3_prefix: s3_uri::S3Uri,
|
||||
s3_prefix: Option<s3_uri::S3Uri>,
|
||||
#[clap(long)]
|
||||
source_connection_string: Option<String>,
|
||||
#[clap(short, long)]
|
||||
interactive: bool,
|
||||
#[clap(long)]
|
||||
pg_bin_dir: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
@@ -77,30 +84,70 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
|
||||
info!("starting");
|
||||
|
||||
let Args {
|
||||
working_directory,
|
||||
s3_prefix,
|
||||
pg_bin_dir,
|
||||
pg_lib_dir,
|
||||
} = Args::parse();
|
||||
let args = Args::parse();
|
||||
|
||||
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
// Validate arguments
|
||||
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
|
||||
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
|
||||
}
|
||||
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
|
||||
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
|
||||
}
|
||||
|
||||
let spec: Spec = {
|
||||
let spec_key = s3_prefix.append("/spec.json");
|
||||
let s3_client = aws_sdk_s3::Client::new(&aws_config);
|
||||
let object = s3_client
|
||||
.get_object()
|
||||
.bucket(&spec_key.bucket)
|
||||
.key(spec_key.key)
|
||||
.send()
|
||||
.await
|
||||
.context("get spec from s3")?
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.context("download spec body")?;
|
||||
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
|
||||
let working_directory = args.working_directory;
|
||||
let pg_bin_dir = args.pg_bin_dir;
|
||||
let pg_lib_dir = args.pg_lib_dir;
|
||||
|
||||
// Initialize AWS clients only if s3_prefix is specified
|
||||
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
|
||||
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
let kms = aws_sdk_kms::Client::new(&config);
|
||||
(Some(config), Some(kms))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
// Get source connection string either from S3 spec or direct argument
|
||||
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
|
||||
let spec: Spec = {
|
||||
let spec_key = s3_prefix.append("/spec.json");
|
||||
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
|
||||
let object = s3_client
|
||||
.get_object()
|
||||
.bucket(&spec_key.bucket)
|
||||
.key(spec_key.key)
|
||||
.send()
|
||||
.await
|
||||
.context("get spec from s3")?
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.context("download spec body")?;
|
||||
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
|
||||
};
|
||||
|
||||
match spec.encryption_secret {
|
||||
EncryptionSecret::KMS { key_id } => {
|
||||
let mut output = kms_client
|
||||
.unwrap()
|
||||
.decrypt()
|
||||
.key_id(key_id)
|
||||
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
|
||||
spec.source_connstring_ciphertext_base64,
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.context("decrypt source connection string")?;
|
||||
let plaintext = output
|
||||
.plaintext
|
||||
.take()
|
||||
.context("get plaintext source connection string")?;
|
||||
String::from_utf8(plaintext.into_inner())
|
||||
.context("parse source connection string as utf8")?
|
||||
}
|
||||
}
|
||||
} else {
|
||||
args.source_connection_string.unwrap()
|
||||
};
|
||||
|
||||
match tokio::fs::create_dir(&working_directory).await {
|
||||
@@ -123,15 +170,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.await
|
||||
.context("create pgdata directory")?;
|
||||
|
||||
//
|
||||
// Setup clients
|
||||
//
|
||||
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
let kms_client = aws_sdk_kms::Client::new(&aws_config);
|
||||
|
||||
//
|
||||
// Initialize pgdata
|
||||
//
|
||||
let pgbin = pg_bin_dir.join("postgres");
|
||||
let pg_version = match get_pg_version(pgbin.as_ref()) {
|
||||
PostgresMajorVersion::V14 => 14,
|
||||
@@ -170,7 +208,13 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.args(["-c", &format!("max_parallel_workers={nproc}")])
|
||||
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
|
||||
.args(["-c", &format!("max_worker_processes={nproc}")])
|
||||
.args(["-c", "effective_io_concurrency=100"])
|
||||
.args([
|
||||
"-c",
|
||||
&format!(
|
||||
"effective_io_concurrency={}",
|
||||
if cfg!(target_os = "macos") { 0 } else { 100 }
|
||||
),
|
||||
])
|
||||
.env_clear()
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
@@ -185,44 +229,58 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.instrument(info_span!("postgres")),
|
||||
);
|
||||
|
||||
// Create neondb database in the running postgres
|
||||
let restore_pg_connstring =
|
||||
format!("host=localhost port=5432 user={superuser} dbname=postgres");
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
|
||||
if res.is_ok() {
|
||||
info!("postgres is ready, could connect to it");
|
||||
break;
|
||||
if start_time.elapsed() > PG_WAIT_TIMEOUT {
|
||||
error!(
|
||||
"timeout exceeded: failed to poll postgres and create database within 10 minutes"
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
|
||||
Ok((client, connection)) => {
|
||||
// Spawn the connection handling task to maintain the connection
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
warn!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
match client.simple_query("CREATE DATABASE neondb;").await {
|
||||
Ok(_) => {
|
||||
info!("created neondb database");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"failed to create database: {}, retying in {}s",
|
||||
e,
|
||||
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
|
||||
);
|
||||
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
info!(
|
||||
"postgres not ready yet, retrying in {}s",
|
||||
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
|
||||
);
|
||||
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Decrypt connection string
|
||||
//
|
||||
let source_connection_string = {
|
||||
match spec.encryption_secret {
|
||||
EncryptionSecret::KMS { key_id } => {
|
||||
let mut output = kms_client
|
||||
.decrypt()
|
||||
.key_id(key_id)
|
||||
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
|
||||
spec.source_connstring_ciphertext_base64,
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.context("decrypt source connection string")?;
|
||||
let plaintext = output
|
||||
.plaintext
|
||||
.take()
|
||||
.context("get plaintext source connection string")?;
|
||||
String::from_utf8(plaintext.into_inner())
|
||||
.context("parse source connection string as utf8")?
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// Start the work
|
||||
//
|
||||
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
|
||||
|
||||
let dumpdir = working_directory.join("dumpdir");
|
||||
|
||||
@@ -310,6 +368,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// If interactive mode, wait for Ctrl+C
|
||||
if args.interactive {
|
||||
info!("Running in interactive mode. Press Ctrl+C to shut down.");
|
||||
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
|
||||
}
|
||||
|
||||
info!("shutdown postgres");
|
||||
{
|
||||
nix::sys::signal::kill(
|
||||
@@ -325,21 +389,24 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.context("wait for postgres to shut down")?;
|
||||
}
|
||||
|
||||
info!("upload pgdata");
|
||||
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
|
||||
.await
|
||||
.context("sync dump directory to destination")?;
|
||||
|
||||
info!("write status");
|
||||
{
|
||||
let status_dir = working_directory.join("status");
|
||||
std::fs::create_dir(&status_dir).context("create status directory")?;
|
||||
let status_file = status_dir.join("pgdata");
|
||||
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
|
||||
.context("write status file")?;
|
||||
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
|
||||
// Only sync if s3_prefix was specified
|
||||
if let Some(s3_prefix) = args.s3_prefix {
|
||||
info!("upload pgdata");
|
||||
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
|
||||
.await
|
||||
.context("sync status directory to destination")?;
|
||||
.context("sync dump directory to destination")?;
|
||||
|
||||
info!("write status");
|
||||
{
|
||||
let status_dir = working_directory.join("status");
|
||||
std::fs::create_dir(&status_dir).context("create status directory")?;
|
||||
let status_file = status_dir.join("pgdata");
|
||||
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
|
||||
.context("write status file")?;
|
||||
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
|
||||
.await
|
||||
.context("sync status directory to destination")?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
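The fast_import changes above replace a single connect attempt with a bounded wait loop driven by `PG_WAIT_TIMEOUT` and `PG_WAIT_RETRY_INTERVAL`. A minimal sketch of that retry pattern, with stand-in constants and a generic probe closure instead of the real `tokio_postgres::connect` plus `CREATE DATABASE` step:

```rust
use std::time::Duration;

// Stand-ins for the PG_WAIT_TIMEOUT / PG_WAIT_RETRY_INTERVAL constants above.
const WAIT_TIMEOUT: Duration = Duration::from_secs(600);
const RETRY_INTERVAL: Duration = Duration::from_millis(300);

/// Retry `probe` until it succeeds, the overall deadline is exceeded, or the caller cancels.
async fn wait_until_ready<F, Fut>(mut probe: F) -> anyhow::Result<()>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<()>>,
{
    let start = tokio::time::Instant::now();
    loop {
        if start.elapsed() > WAIT_TIMEOUT {
            anyhow::bail!("timeout exceeded while waiting for postgres");
        }
        match probe().await {
            Ok(()) => return Ok(()),
            Err(err) => {
                // Not ready yet: report and retry after a short sleep.
                eprintln!("not ready ({err}), retrying in {RETRY_INTERVAL:?}");
                tokio::time::sleep(RETRY_INTERVAL).await;
            }
        }
    }
}
```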
@@ -17,7 +17,8 @@ use crate::{

#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct ExtensionServerParams {
    is_library: Option<bool>,
    #[serde(default)]
    is_library: bool,
}

/// Download a remote extension.
@@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension(

    remote_extensions.get_ext(
        &filename,
        params.is_library.unwrap_or(false),
        params.is_library,
        &compute.build_tag,
        &compute.pgversion,
    )
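The `ExtensionServerParams` change above swaps `Option<bool>` for a `#[serde(default)]` bool, so a missing field simply deserializes to `false`. A small stand-alone sketch of that serde behavior; the `Params` struct here is illustrative, not the private compute type:

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Params {
    // With #[serde(default)], an absent field becomes bool::default() == false.
    #[serde(default)]
    is_library: bool,
}

fn main() -> anyhow::Result<()> {
    let p: Params = serde_json::from_str("{}")?;
    assert!(!p.is_library);
    let p: Params = serde_json::from_str(r#"{"is_library": true}"#)?;
    assert!(p.is_library);
    Ok(())
}
```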
@@ -9,8 +9,9 @@ use clap::{Parser, Subcommand};
use pageserver_api::{
    controller_api::{
        AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
        SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
        SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
        ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
        TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -231,6 +232,13 @@ enum Command {
    },
    /// List safekeepers known to the storage controller
    Safekeepers {},
    /// Set the scheduling policy of the specified safekeeper
    SafekeeperScheduling {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        scheduling_policy: SkSchedulingPolicyArg,
    },
}

#[derive(Parser)]
@@ -283,6 +291,17 @@ impl FromStr for PlacementPolicyArg {
    }
}

#[derive(Debug, Clone)]
struct SkSchedulingPolicyArg(SkSchedulingPolicy);

impl FromStr for SkSchedulingPolicyArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        SkSchedulingPolicy::from_str(s).map(Self)
    }
}

#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);

@@ -1202,6 +1221,23 @@ async fn main() -> anyhow::Result<()> {
            }
            println!("{table}");
        }
        Command::SafekeeperScheduling {
            node_id,
            scheduling_policy,
        } => {
            let scheduling_policy = scheduling_policy.0;
            storcon_client
                .dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
                    Method::POST,
                    format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
                    Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
                )
                .await?;
            println!(
                "Scheduling policy of {node_id} set to {}",
                String::from(scheduling_policy)
            );
        }
    }

    Ok(())
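The new `SafekeeperScheduling` subcommand POSTs a `SafekeeperSchedulingPolicyRequest` to `control/v1/safekeeper/{node_id}/scheduling_policy`. A sketch of the request body it sends, assuming the default serde derive shown in the `controller_api` diff below:

```rust
use pageserver_api::controller_api::{SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy};

fn example_body() -> anyhow::Result<String> {
    let req = SafekeeperSchedulingPolicyRequest {
        scheduling_policy: SkSchedulingPolicy::Pause,
    };
    // POST control/v1/safekeeper/{node_id}/scheduling_policy
    // With the default serde derive this serializes roughly as {"scheduling_policy":"Pause"}.
    Ok(serde_json::to_string(&req)?)
}
```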
@@ -324,7 +324,7 @@ impl From<NodeSchedulingPolicy> for String {
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
    Active,
    Disabled,
    Pause,
    Decomissioned,
}

@@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy {
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "active" => Self::Active,
            "disabled" => Self::Disabled,
            "pause" => Self::Pause,
            "decomissioned" => Self::Decomissioned,
            _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
            _ => {
                return Err(anyhow::anyhow!(
                    "Unknown scheduling policy '{s}', try active,pause,decomissioned"
                ))
            }
        })
    }
}
@@ -346,7 +350,7 @@ impl From<SkSchedulingPolicy> for String {
        use SkSchedulingPolicy::*;
        match value {
            Active => "active",
            Disabled => "disabled",
            Pause => "pause",
            Decomissioned => "decomissioned",
        }
        .to_string()
@@ -416,8 +420,6 @@ pub struct MetadataHealthListOutdatedResponse {
}

/// Publicly exposed safekeeper description
///
/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperDescribeResponse {
    pub id: NodeId,
@@ -433,6 +435,11 @@ pub struct SafekeeperDescribeResponse {
    pub scheduling_policy: SkSchedulingPolicy,
}

#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
    pub scheduling_policy: SkSchedulingPolicy,
}

#[cfg(test)]
mod test {
    use super::*;
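A short sketch of parsing and printing the scheduling policy, using only the `FromStr` and `From<SkSchedulingPolicy> for String` impls above; the helper function itself is illustrative:

```rust
use std::str::FromStr;

use pageserver_api::controller_api::SkSchedulingPolicy;

fn parse_policy(arg: &str) -> anyhow::Result<SkSchedulingPolicy> {
    // Accepted spellings mirror the error hint: "active", "pause", "decomissioned".
    let policy = SkSchedulingPolicy::from_str(arg)?;
    // From<SkSchedulingPolicy> for String gives the canonical lowercase spelling back,
    // so the round trip holds for those canonical inputs.
    assert_eq!(String::from(policy), arg);
    Ok(policy)
}
```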
@@ -24,7 +24,9 @@ pub struct Key {

/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
#[derive(
    Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
)]
pub struct CompactKey(i128);

/// The storage key size.
@@ -29,7 +29,7 @@ use utils::{
};

use crate::{
    key::Key,
    key::{CompactKey, Key},
    reltag::RelTag,
    shard::{ShardCount, ShardStripeSize, TenantShardId},
};
@@ -1981,6 +1981,23 @@ impl PagestreamBeMessage {
    }
}

#[derive(Debug, Serialize, Deserialize)]
pub struct PageTraceEvent {
    pub key: CompactKey,
    pub effective_lsn: Lsn,
    pub time: SystemTime,
}

impl Default for PageTraceEvent {
    fn default() -> Self {
        Self {
            key: Default::default(),
            effective_lsn: Default::default(),
            time: std::time::UNIX_EPOCH,
        }
    }
}

#[cfg(test)]
mod tests {
    use serde_json::json;
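`PageTraceEvent` gets a `Default` impl so callers can measure its serialized size: with bincode's default fixed-width integer encoding every event serializes to the same number of bytes. A sketch of turning a byte budget into an event count, mirroring how the page-trace HTTP handler later in this compare uses it:

```rust
use pageserver_api::models::PageTraceEvent;

fn events_for_budget(size_limit_bytes: usize) -> anyhow::Result<usize> {
    // One sample is enough to size the whole buffer, because the encoding is fixed-size.
    let event_size = bincode::serialize(&PageTraceEvent::default())?.len();
    Ok(size_limit_bytes / event_size)
}
```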
@@ -13,3 +13,4 @@ postgres_ffi.workspace = true
pq_proto.workspace = true
tokio.workspace = true
utils.workspace = true
pageserver_api.workspace = true
@@ -38,12 +38,14 @@ impl Display for SafekeeperId {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(transparent)]
pub struct MemberSet {
    pub m: Vec<SafekeeperId>,
    pub members: Vec<SafekeeperId>,
}

impl MemberSet {
    pub fn empty() -> Self {
        MemberSet { m: Vec::new() }
        MemberSet {
            members: Vec::new(),
        }
    }

    pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
@@ -51,11 +53,11 @@ impl MemberSet {
        if hs.len() != members.len() {
            bail!("duplicate safekeeper id in the set {:?}", members);
        }
        Ok(MemberSet { m: members })
        Ok(MemberSet { members })
    }

    pub fn contains(&self, sk: &SafekeeperId) -> bool {
        self.m.iter().any(|m| m.id == sk.id)
        self.members.iter().any(|m| m.id == sk.id)
    }

    pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
@@ -65,7 +67,7 @@ impl MemberSet {
                sk.id, self
            ));
        }
        self.m.push(sk);
        self.members.push(sk);
        Ok(())
    }
}
@@ -73,7 +75,11 @@ impl MemberSet {
impl Display for MemberSet {
    /// Display as a comma separated list of members.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
        let sks_str = self
            .members
            .iter()
            .map(|m| m.to_string())
            .collect::<Vec<_>>();
        write!(f, "({})", sks_str.join(", "))
    }
}
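A usage sketch of `MemberSet` after the `m` to `members` rename. The import path and the `sk1`/`sk2` values are assumptions; only calls shown in the diff are used:

```rust
// Module path assumed; adjust to wherever membership.rs lives in your tree.
use safekeeper_api::membership::{MemberSet, SafekeeperId};

fn build_member_set(sk1: SafekeeperId, sk2: SafekeeperId) -> anyhow::Result<MemberSet> {
    let mut set = MemberSet::new(vec![sk1])?; // rejects duplicate ids
    set.add(sk2)?;                            // rejects members already present
    assert!(set.contains(&set.members[0]));
    println!("{set}"); // Display prints a comma separated "(sk1, sk2)" style list
    Ok(set)
}
```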
@@ -1,5 +1,6 @@
//! Types used in safekeeper http API. Many of them are also reused internally.

use pageserver_api::shard::ShardIdentity;
use postgres_ffi::TimestampTz;
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
@@ -146,7 +147,13 @@ pub type ConnectionId = u32;

/// Serialize is used only for json'ing in API response. Also used internally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalSenderState {
pub enum WalSenderState {
    Vanilla(VanillaWalSenderState),
    Interpreted(InterpretedWalSenderState),
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VanillaWalSenderState {
    pub ttid: TenantTimelineId,
    pub addr: SocketAddr,
    pub conn_id: ConnectionId,
@@ -155,6 +162,17 @@ pub struct WalSenderState {
    pub feedback: ReplicationFeedback,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterpretedWalSenderState {
    pub ttid: TenantTimelineId,
    pub shard: ShardIdentity,
    pub addr: SocketAddr,
    pub conn_id: ConnectionId,
    // postgres application_name
    pub appname: Option<String>,
    pub feedback: ReplicationFeedback,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
    /// None means it is recovery initiated by us (this safekeeper).
libs/utils/src/guard_arc_swap.rs (new file, 54 lines)
@@ -0,0 +1,54 @@
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
//! don't block reads.

use arc_swap::ArcSwap;
use std::sync::Arc;
use tokio::sync::TryLockError;

pub struct GuardArcSwap<T> {
    inner: ArcSwap<T>,
    guard: tokio::sync::Mutex<()>,
}

pub struct Guard<'a, T> {
    _guard: tokio::sync::MutexGuard<'a, ()>,
    inner: &'a ArcSwap<T>,
}

impl<T> GuardArcSwap<T> {
    pub fn new(inner: T) -> Self {
        Self {
            inner: ArcSwap::new(Arc::new(inner)),
            guard: tokio::sync::Mutex::new(()),
        }
    }

    pub fn read(&self) -> Arc<T> {
        self.inner.load_full()
    }

    pub async fn write_guard(&self) -> Guard<'_, T> {
        Guard {
            _guard: self.guard.lock().await,
            inner: &self.inner,
        }
    }

    pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
        let guard = self.guard.try_lock()?;
        Ok(Guard {
            _guard: guard,
            inner: &self.inner,
        })
    }
}

impl<T> Guard<'_, T> {
    pub fn read(&self) -> Arc<T> {
        self.inner.load_full()
    }

    pub fn write(&mut self, value: T) {
        self.inner.store(Arc::new(value));
    }
}
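A usage sketch of the new `GuardArcSwap` wrapper. `Partitioning` is a stand-in type, and the `utils::guard_arc_swap` path matches the module registered in `libs/utils/src/lib.rs` below:

```rust
use std::sync::Arc;

use utils::guard_arc_swap::GuardArcSwap;

struct Partitioning {
    version: u64,
}

#[tokio::main]
async fn main() {
    let state = GuardArcSwap::new(Partitioning { version: 0 });

    // Readers never block: they get an Arc snapshot of the current value.
    let snapshot: Arc<Partitioning> = state.read();
    assert_eq!(snapshot.version, 0);

    // Writers serialize on the internal mutex, then publish a new Arc.
    {
        let mut guard = state.write_guard().await;
        let current = guard.read(); // read-modify-write under the guard
        guard.write(Partitioning { version: current.version + 1 });
    }

    // A second writer can bail out instead of waiting, like repartition() does.
    if let Ok(mut guard) = state.try_write_guard() {
        guard.write(Partitioning { version: 2 });
    }

    assert_eq!(state.read().version, 2);
}
```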
@@ -98,6 +98,8 @@ pub mod try_rcu;

pub mod pprof;

pub mod guard_arc_swap;

// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;
@@ -64,7 +64,7 @@ pub struct InterpretedWalRecords {
}

/// An interpreted Postgres WAL record, ready to be handled by the pageserver
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct InterpretedWalRecord {
    /// Optional metadata record - may cause writes to metadata keys
    /// in the storage engine
@@ -32,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub enum ValueMeta {
    Serialized(SerializedValueMeta),
    Observed(ObservedValueMeta),
@@ -79,7 +79,7 @@ impl PartialEq for OrderedValueMeta {
impl Eq for OrderedValueMeta {}

/// Metadata for a [`Value`] serialized into the batch.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct SerializedValueMeta {
    pub key: CompactKey,
    pub lsn: Lsn,
@@ -91,14 +91,14 @@ pub struct SerializedValueMeta {
}

/// Metadata for a [`Value`] observed by the batch
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct ObservedValueMeta {
    pub key: CompactKey,
    pub lsn: Lsn,
}

/// Batch of serialized [`Value`]s.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct SerializedValueBatch {
    /// [`Value`]s serialized in EphemeralFile's native format,
    /// ready for disk write by the pageserver
@@ -215,7 +215,6 @@ impl Wrapper {
            syncSafekeepers: config.sync_safekeepers,
            systemId: 0,
            pgTimeline: 1,
            proto_version: 2,
            callback_data,
        };
        let c_config = Box::into_raw(Box::new(c_config));
|
||||
@@ -16,6 +16,7 @@ arc-swap.workspace = true
|
||||
async-compression.workspace = true
async-stream.workspace = true
bit_field.workspace = true
bincode.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
|
||||
|
||||
@@ -8,9 +8,11 @@ license.workspace = true

[dependencies]
anyhow.workspace = true
bincode.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
humantime.workspace = true
itertools.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
|
||||
@@ -9,7 +9,9 @@ mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
mod page_trace;

use page_trace::PageTraceCmd;
use std::{
    str::FromStr,
    time::{Duration, SystemTime},
@@ -64,6 +66,7 @@ enum Commands {
    Layer(LayerCmd),
    /// Debug print a hex key found from logs
    Key(key::DescribeKeyCommand),
    PageTrace(PageTraceCmd),
}

/// Read and update pageserver metadata file
@@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
                .await?;
        }
        Commands::Key(dkc) => dkc.execute(),
        Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
    };
    Ok(())
}
pageserver/ctl/src/page_trace.rs (new file, 73 lines)
@@ -0,0 +1,73 @@
use std::collections::HashMap;
use std::io::BufReader;

use camino::Utf8PathBuf;
use clap::Parser;
use itertools::Itertools as _;
use pageserver_api::key::{CompactKey, Key};
use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::RelTag;

/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats.
#[derive(Parser)]
pub(crate) struct PageTraceCmd {
    /// Trace input file.
    path: Utf8PathBuf,
}

pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> {
    let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?);
    let mut events: Vec<PageTraceEvent> = Vec::new();
    loop {
        match bincode::deserialize_from(&mut file) {
            Ok(event) => events.push(event),
            Err(err) => {
                if let bincode::ErrorKind::Io(ref err) = *err {
                    if err.kind() == std::io::ErrorKind::UnexpectedEof {
                        break;
                    }
                }
                return Err(err.into());
            }
        }
    }

    let mut reads_by_relation: HashMap<RelTag, i64> = HashMap::new();
    let mut reads_by_key: HashMap<CompactKey, i64> = HashMap::new();

    for event in events {
        let key = Key::from_compact(event.key);
        let reltag = RelTag {
            spcnode: key.field2,
            dbnode: key.field3,
            relnode: key.field4,
            forknum: key.field5,
        };

        *reads_by_relation.entry(reltag).or_default() += 1;
        *reads_by_key.entry(event.key).or_default() += 1;
    }

    let multi_read_keys = reads_by_key
        .into_iter()
        .filter(|(_, count)| *count > 1)
        .sorted_by_key(|(key, count)| (-*count, *key))
        .collect_vec();

    println!("Multi-read keys: {}", multi_read_keys.len());
    for (key, count) in multi_read_keys {
        println!("  {key}: {count}");
    }

    let reads_by_relation = reads_by_relation
        .into_iter()
        .sorted_by_key(|(rel, count)| (-*count, *rel))
        .collect_vec();

    println!("Reads by relation:");
    for (reltag, count) in reads_by_relation {
        println!("  {reltag}: {count}");
    }

    Ok(())
}
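A sketch of producing a trace file that `pagectl page-trace` can read back: events are bincode-serialized back to back, which is exactly the framing the reader above expects until it hits EOF. The output path and event list are arbitrary:

```rust
use std::io::{BufWriter, Write};

use pageserver_api::models::PageTraceEvent;

fn write_sample_trace(path: &str, events: &[PageTraceEvent]) -> anyhow::Result<()> {
    let mut out = BufWriter::new(std::fs::File::create(path)?);
    for event in events {
        // The reader deserializes events back-to-back until UnexpectedEof.
        bincode::serialize_into(&mut out, event)?;
    }
    out.flush()?;
    Ok(())
}
```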
@@ -27,6 +27,7 @@ use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::OffloadedTimelineInfo;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantConfigPatchRequest;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
@@ -51,7 +52,9 @@ use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
use scopeguard::defer;
|
||||
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -1521,6 +1524,71 @@ async fn timeline_gc_unblocking_handler(
|
||||
block_or_unblock_gc(request, false).await
|
||||
}
|
||||
|
||||
/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding.
|
||||
/// Use the `pagectl page-trace` command to decode and analyze the output.
|
||||
async fn timeline_page_trace_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024);
|
||||
let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5);
|
||||
|
||||
// Convert size limit to event limit based on the serialized size of an event. The event size is
|
||||
// fixed, as the default bincode serializer uses fixed-width integer encoding.
|
||||
let event_size = bincode::serialize(&PageTraceEvent::default())
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
.len();
|
||||
let event_limit = size_limit / event_size;
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
// Install a page trace, unless one is already in progress. We just use a buffered channel,
|
||||
// which may 2x the memory usage in the worst case, but it's still bounded.
|
||||
let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit);
|
||||
let cur = timeline.page_trace.load();
|
||||
let installed = cur.is_none()
|
||||
&& timeline
|
||||
.page_trace
|
||||
.compare_and_swap(cur, Some(Arc::new(trace_tx)))
|
||||
.is_none();
|
||||
if !installed {
|
||||
return Err(ApiError::Conflict("page trace already active".to_string()));
|
||||
}
|
||||
defer!(timeline.page_trace.store(None)); // uninstall on return
|
||||
|
||||
// Collect the trace and return it to the client. We could stream the response, but this is
|
||||
// simple and fine.
|
||||
let mut body = Vec::with_capacity(size_limit);
|
||||
let deadline = Instant::now() + Duration::from_secs(time_limit_secs);
|
||||
|
||||
while body.len() < size_limit {
|
||||
tokio::select! {
|
||||
event = trace_rx.recv() => {
|
||||
let Some(event) = event else {
|
||||
break; // shouldn't happen (sender doesn't close, unless timeline dropped)
|
||||
};
|
||||
bincode::serialize_into(&mut body, &event)
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
}
|
||||
_ = tokio::time::sleep_until(deadline) => break, // time limit reached
|
||||
_ = cancel.cancelled() => return Err(ApiError::Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(body))
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
|
||||
///
|
||||
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
|
||||
@@ -3479,6 +3547,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| api_handler(r, timeline_gc_unblocking_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace",
|
||||
|r| api_handler(r, timeline_page_trace_handler),
|
||||
)
|
||||
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
|
||||
api_handler(r, secondary_upload_handler)
|
||||
})
|
||||
|
||||
@@ -67,6 +67,7 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::{basebackup, timed_after_cancellation};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -1718,6 +1719,20 @@ impl PageServerHandler {
            .query_metrics
            .observe_getpage_batch_start(requests.len());

        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
            let time = SystemTime::now();
            for batch in &requests {
                let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact();
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
                    effective_lsn,
                    time,
                });
            }
        }

        let results = timeline
            .get_rel_page_at_lsn_batched(
                requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
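Elsewhere in this compare, the pageserver HTTP handler installs the trace into `Timeline::page_trace`, an `ArcSwapOption<Sender<PageTraceEvent>>`, and the page service above records events with `try_send` so a full buffer never blocks GetPage processing. A simplified sketch of that install/record handshake with stand-in types:

```rust
use std::sync::Arc;

use arc_swap::ArcSwapOption;
use tokio::sync::mpsc;

/// Install a trace channel only if no trace is currently active; returns the receiver on success.
fn install_trace(
    slot: &ArcSwapOption<mpsc::Sender<u64>>,
    capacity: usize,
) -> Option<mpsc::Receiver<u64>> {
    let (tx, rx) = mpsc::channel(capacity);
    let cur = slot.load();
    // Same compare_and_swap idea as the handler: only one tracer at a time.
    let installed = cur.is_none() && slot.compare_and_swap(cur, Some(Arc::new(tx))).is_none();
    installed.then_some(rx)
}

/// Record an event if a trace is active; errors are deliberately ignored.
fn record_event(slot: &ArcSwapOption<mpsc::Sender<u64>>, event: u64) {
    if let Some(tx) = slot.load().as_ref() {
        // Buffer may be full or the tracer may have gone away; never block the hot path.
        let _ = tx.try_send(event);
    }
}
```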
@@ -14,7 +14,7 @@ pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use arc_swap::ArcSwap;
|
||||
use arc_swap::{ArcSwap, ArcSwapOption};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use chrono::{DateTime, Utc};
|
||||
@@ -23,6 +23,7 @@ use fail::fail_point;
|
||||
use handle::ShardTimelineId;
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
@@ -42,6 +43,7 @@ use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::{
|
||||
runtime::Handle,
|
||||
sync::{oneshot, watch},
|
||||
@@ -49,7 +51,9 @@ use tokio::{
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
fs_ext, pausable_failpoint,
|
||||
fs_ext,
|
||||
guard_arc_swap::GuardArcSwap,
|
||||
pausable_failpoint,
|
||||
postgres_client::PostgresClientProtocol,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
@@ -351,8 +355,8 @@ pub struct Timeline {
|
||||
// though let's keep them both for better error visibility.
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// When did we last calculate the partitioning? Make it pub to test cases.
|
||||
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
/// The repartitioning result. Allows a single writer and multiple readers.
|
||||
pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -433,6 +437,9 @@ pub struct Timeline {
|
||||
|
||||
/// Cf. [`crate::tenant::CreateTimelineIdempotency`].
|
||||
pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
|
||||
/// If Some, collects GetPage metadata for an ongoing PageTrace.
|
||||
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
|
||||
}
|
||||
|
||||
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
|
||||
@@ -2335,7 +2342,8 @@ impl Timeline {
|
||||
// initial logical size is 0.
|
||||
LogicalSize::empty_initial()
|
||||
},
|
||||
partitioning: tokio::sync::Mutex::new((
|
||||
|
||||
partitioning: GuardArcSwap::new((
|
||||
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
|
||||
Lsn(0),
|
||||
)),
|
||||
@@ -2380,6 +2388,8 @@ impl Timeline {
|
||||
attach_wal_lag_cooldown,
|
||||
|
||||
create_idempotency,
|
||||
|
||||
page_trace: Default::default(),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -4021,18 +4031,15 @@ impl Timeline {
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
|
||||
let Ok(mut guard) = self.partitioning.try_write_guard() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
// Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for
|
||||
// gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own
|
||||
// heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called concurrently, this is rare and a retry should be fine"
|
||||
"repartition() called concurrently"
|
||||
)));
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
|
||||
if lsn < *partition_lsn {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
@@ -4060,9 +4067,9 @@ impl Timeline {
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
}; // no partitioning for metadata keys for now
|
||||
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
|
||||
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
|
||||
let result = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
guard.write(result.clone());
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
|
||||
@@ -1776,7 +1776,10 @@ impl Timeline {
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
if cfg!(debug_assertions) {
|
||||
|
||||
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
|
||||
|
||||
if debug_mode {
|
||||
for (log_key, _, _) in full_history {
|
||||
assert_eq!(log_key, &key, "mismatched key");
|
||||
}
|
||||
@@ -1922,15 +1925,19 @@ impl Timeline {
|
||||
output
|
||||
}
|
||||
|
||||
let mut key_exists = false;
|
||||
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
|
||||
// TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
|
||||
records_since_last_image += split_for_lsn.len();
|
||||
let generate_image = if i == 0 && !has_ancestor {
|
||||
// Whether to produce an image into the final layer files
|
||||
let produce_image = if i == 0 && !has_ancestor {
|
||||
// We always generate images for the first batch (below horizon / lowest retain_lsn)
|
||||
true
|
||||
} else if i == batch_cnt - 1 {
|
||||
// Do not generate images for the last batch (above horizon)
|
||||
false
|
||||
} else if records_since_last_image == 0 {
|
||||
false
|
||||
} else if records_since_last_image >= delta_threshold_cnt {
|
||||
// Generate images when there are too many records
|
||||
true
|
||||
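The if/else chain above decides, per LSN batch, whether an image is written into the final layer files. A compact restatement as a free-standing Rust function follows; the final fall-through to `false` is an assumption, since the tail of the chain lies outside this hunk.

```rust
/// Sketch of the per-batch image decision described above (illustrative only).
fn produce_image(
    i: usize,                        // index of the LSN batch
    batch_cnt: usize,                // total number of batches
    has_ancestor: bool,
    records_since_last_image: usize,
    delta_threshold_cnt: usize,
) -> bool {
    if i == 0 && !has_ancestor {
        true  // always image the first batch (below horizon / lowest retain_lsn)
    } else if i == batch_cnt - 1 {
        false // never image the last batch (above the horizon)
    } else if records_since_last_image == 0 {
        false // nothing accumulated since the previous image
    } else if records_since_last_image >= delta_threshold_cnt {
        true  // too many delta records piled up
    } else {
        false // assumed fall-through; the remaining arms are not shown in this hunk
    }
}

fn main() {
    assert!(produce_image(0, 4, false, 0, 10));    // first batch, no ancestor
    assert!(!produce_image(3, 4, false, 100, 10)); // last batch
    assert!(produce_image(1, 4, false, 12, 10));   // threshold exceeded
}
```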
@@ -1945,29 +1952,45 @@ impl Timeline {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if let Some((_, _, val)) = replay_history.first() {
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
|
||||
|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
},
|
||||
);
|
||||
}
|
||||
if replay_history.is_empty() && !key_exists {
|
||||
// The key does not exist at earlier LSN, we can skip this iteration.
|
||||
retention.push(Vec::new());
|
||||
continue;
|
||||
} else {
|
||||
key_exists = true;
|
||||
}
|
||||
if generate_image && records_since_last_image > 0 {
|
||||
let Some((_, _, val)) = replay_history.first() else {
|
||||
unreachable!("replay history should not be empty once it exists")
|
||||
};
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
});
|
||||
}
|
||||
// Whether to reconstruct the image. In debug mode, we will generate an image
|
||||
// at every retain_lsn to ensure data is not corrupted, but we won't put the
|
||||
// image into the final layer.
|
||||
let generate_image = produce_image || debug_mode;
|
||||
if produce_image {
|
||||
records_since_last_image = 0;
|
||||
let replay_history_for_debug = if cfg!(debug_assertions) {
|
||||
}
|
||||
let img_and_lsn = if generate_image {
|
||||
let replay_history_for_debug = if debug_mode {
|
||||
Some(replay_history.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let history = if produce_image {
|
||||
std::mem::take(&mut replay_history)
|
||||
} else {
|
||||
replay_history.clone()
|
||||
};
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
|
||||
@@ -2004,8 +2027,20 @@ impl Timeline {
|
||||
}
|
||||
records.reverse();
|
||||
let state = ValueReconstructState { img, records };
|
||||
let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
|
||||
// last batch does not generate image so i is always in range, unless we force generate
|
||||
// an image during testing
|
||||
let request_lsn = if i >= lsn_split_points.len() {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
lsn_split_points[i]
|
||||
};
|
||||
let img = self.reconstruct_value(key, request_lsn, state).await?;
|
||||
Some((request_lsn, img))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
if produce_image {
|
||||
let (request_lsn, img) = img_and_lsn.unwrap();
|
||||
replay_history.push((key, request_lsn, Value::Image(img.clone())));
|
||||
retention.push(vec![(request_lsn, Value::Image(img))]);
|
||||
} else {
|
||||
@@ -2111,12 +2146,7 @@ impl Timeline {
|
||||
let mut compact_jobs = Vec::new();
|
||||
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
|
||||
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
|
||||
let ((dense_ks, sparse_ks), _) = {
|
||||
let Ok(partition) = self.partitioning.try_lock() else {
|
||||
bail!("failed to acquire partition lock during gc-compaction");
|
||||
};
|
||||
partition.clone()
|
||||
};
|
||||
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
|
||||
// Truncate the key range to be within user specified compaction range.
|
||||
fn truncate_to(
|
||||
source_start: &Key,
|
||||
@@ -2273,6 +2303,8 @@ impl Timeline {
|
||||
let compact_key_range = job.compact_key_range;
|
||||
let compact_lsn_range = job.compact_lsn_range;
|
||||
|
||||
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);
|
||||
|
||||
scopeguard::defer! {
|
||||
@@ -2398,7 +2430,7 @@ impl Timeline {
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(job_desc.gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
if debug_mode {
|
||||
assert_eq!(
|
||||
res,
|
||||
job_desc
|
||||
|
||||
@@ -911,7 +911,74 @@ pageserver_receive(shardno_t shard_no)
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
|
||||
shard->nresponses_received++;
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
|
||||
static NeonResponse *
|
||||
pageserver_try_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
if (shard->state != PS_Connected)
|
||||
return NULL;
|
||||
|
||||
Assert(pageserver_conn);
|
||||
|
||||
rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */);
|
||||
|
||||
if (rc == 0)
|
||||
return NULL;
|
||||
else if (rc > 0)
|
||||
{
|
||||
PG_TRY();
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
@@ -980,6 +1047,7 @@ page_server_api api =
|
||||
.send = pageserver_send,
|
||||
.flush = pageserver_flush,
|
||||
.receive = pageserver_receive,
|
||||
.try_receive = pageserver_try_receive,
|
||||
.disconnect = pageserver_disconnect_shard
|
||||
};
|
||||
|
||||
|
||||
@@ -51,26 +51,6 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint16 - get a binary 2-byte int from a message buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
uint16
|
||||
pq_getmsgint16(StringInfo msg)
|
||||
{
|
||||
return pq_getmsgint(msg, 2);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint32 - get a binary 4-byte int from a message buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
uint32
|
||||
pq_getmsgint32(StringInfo msg)
|
||||
{
|
||||
return pq_getmsgint(msg, 4);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
|
||||
* --------------------------------
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
#endif
|
||||
|
||||
bool HexDecodeString(uint8 *result, char *input, int nbytes);
|
||||
uint16 pq_getmsgint16(StringInfo msg);
|
||||
uint32 pq_getmsgint32(StringInfo msg);
|
||||
uint32 pq_getmsgint32_le(StringInfo msg);
|
||||
uint64 pq_getmsgint64_le(StringInfo msg);
|
||||
void pq_sendint32_le(StringInfo buf, uint32 i);
|
||||
|
||||
@@ -192,9 +192,29 @@ typedef uint16 shardno_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/*
|
||||
* Send this request to the PageServer associated with this shard.
|
||||
*/
|
||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||
/*
|
||||
* Blocking read for the next response of this shard.
|
||||
*
|
||||
* When a CANCEL signal is handled, the connection state will be
|
||||
* unmodified.
|
||||
*/
|
||||
NeonResponse *(*receive) (shardno_t shard_no);
|
||||
/*
|
||||
* Try to get the next response from the TCP buffers, if any.
|
||||
* Returns NULL when the data is not yet available.
|
||||
*/
|
||||
NeonResponse *(*try_receive) (shardno_t shard_no);
|
||||
/*
|
||||
* Make sure all requests are sent to PageServer.
|
||||
*/
|
||||
bool (*flush) (shardno_t shard_no);
|
||||
/*
|
||||
* Disconnect from this pageserver shard.
|
||||
*/
|
||||
void (*disconnect) (shardno_t shard_no);
|
||||
} page_server_api;
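The comments on page_server_api above define the contract: receive blocks for the next response of a shard, while try_receive only returns what is already sitting in the TCP buffers and otherwise returns NULL. The Rust sketch below is a purely illustrative analogue of that non-blocking drain, with an mpsc channel standing in for the per-shard connection; it is not the extension's C code.

```rust
use std::sync::mpsc::{channel, Receiver, TryRecvError};

/// Illustrative only: drain whatever responses are already buffered,
/// without ever blocking (the try_receive contract above).
fn drain_buffered<T>(rx: &Receiver<T>) -> Vec<T> {
    let mut out = Vec::new();
    loop {
        match rx.try_recv() {
            Ok(resp) => out.push(resp),               // response already buffered: consume it
            Err(TryRecvError::Empty) => break,        // nothing available yet, return without blocking
            Err(TryRecvError::Disconnected) => break, // connection gone
        }
    }
    out
}

fn main() {
    let (tx, rx) = channel();
    tx.send("getpage response 1").unwrap();
    tx.send("getpage response 2").unwrap();
    // The non-blocking drain picks up what is already there and stops.
    assert_eq!(drain_buffered(&rx).len(), 2);
    assert!(drain_buffered(&rx).is_empty());
    // A blocking receive() would instead wait here: rx.recv().unwrap();
}
```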
|
||||
|
||||
|
||||
@@ -405,6 +405,56 @@ compact_prefetch_buffers(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there might be responses still in the TCP buffer, then
|
||||
* we should try to use those, so as to reduce any TCP backpressure
|
||||
* on the OS/PS side.
|
||||
*
|
||||
* This procedure handles that.
|
||||
*
|
||||
* Note that this is only valid as long as the only pipelined
|
||||
* operations in the TCP buffer are getPage@Lsn requests.
|
||||
*/
|
||||
static void
|
||||
prefetch_pump_state(void)
|
||||
{
|
||||
while (MyPState->ring_receive != MyPState->ring_flush)
|
||||
{
|
||||
NeonResponse *response;
|
||||
PrefetchRequest *slot;
|
||||
MemoryContext old;
|
||||
|
||||
slot = GetPrfSlot(MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = page_server->try_receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
|
||||
if (response == NULL)
|
||||
break;
|
||||
|
||||
/* The slot should still be valid */
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
MyPState->ring_receive += 1;
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_RECEIVED;
|
||||
slot->response = response;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
readahead_buffer_resize(int newsize, void *extra)
|
||||
{
|
||||
@@ -2808,6 +2858,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
MyPState->ring_last <= ring_index);
|
||||
}
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2849,6 +2901,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
Assert(ring_index < MyPState->ring_unused &&
|
||||
MyPState->ring_last <= ring_index);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
#endif /* PG_MAJORVERSION_NUM < 17 */
|
||||
@@ -2891,6 +2945,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
*/
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdwriteback(reln, forknum, blocknum, nblocks);
|
||||
@@ -3145,6 +3201,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -3282,6 +3340,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
|
||||
buffers, nblocks, read);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -3450,6 +3510,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
@@ -3503,6 +3565,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
|
||||
@@ -3792,6 +3856,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdimmedsync(reln, forknum);
|
||||
|
||||
@@ -70,7 +70,6 @@ static bool SendAppendRequests(Safekeeper *sk);
|
||||
static bool RecvAppendResponses(Safekeeper *sk);
|
||||
static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
|
||||
static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
|
||||
static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version);
|
||||
static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
|
||||
static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
|
||||
static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
|
||||
@@ -82,8 +81,6 @@ static char *FormatSafekeeperState(Safekeeper *sk);
|
||||
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
static char *FormatEvents(WalProposer *wp, uint32 events);
|
||||
static void UpdateDonorShmem(WalProposer *wp);
|
||||
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
|
||||
static void MembershipConfigurationFree(MembershipConfiguration *mconf);
|
||||
|
||||
WalProposer *
|
||||
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
@@ -140,21 +137,25 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
}
|
||||
wp->quorum = wp->n_safekeepers / 2 + 1;
|
||||
|
||||
if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
|
||||
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
|
||||
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
|
||||
|
||||
/* Fill the greeting package */
|
||||
wp->greetRequest.pam.tag = 'g';
|
||||
if (!wp->config->neon_tenant)
|
||||
wp_log(FATAL, "neon.tenant_id is not provided");
|
||||
wp->greetRequest.tenant_id = wp->config->neon_tenant;
|
||||
wp->greetRequest.tag = 'g';
|
||||
wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
|
||||
wp->greetRequest.pgVersion = PG_VERSION_NUM;
|
||||
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
|
||||
wp->greetRequest.systemId = wp->config->systemId;
|
||||
if (!wp->config->neon_timeline)
|
||||
wp_log(FATAL, "neon.timeline_id is not provided");
|
||||
wp->greetRequest.timeline_id = wp->config->neon_timeline;
|
||||
wp->greetRequest.pg_version = PG_VERSION_NUM;
|
||||
wp->greetRequest.system_id = wp->config->systemId;
|
||||
wp->greetRequest.wal_seg_size = wp->config->wal_segment_size;
|
||||
if (*wp->config->neon_timeline != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
|
||||
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
|
||||
if (!wp->config->neon_tenant)
|
||||
wp_log(FATAL, "neon.tenant_id is not provided");
|
||||
if (*wp->config->neon_tenant != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
|
||||
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
|
||||
|
||||
wp->greetRequest.timeline = wp->config->pgTimeline;
|
||||
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
|
||||
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
@@ -164,14 +165,12 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
void
|
||||
WalProposerFree(WalProposer *wp)
|
||||
{
|
||||
MembershipConfigurationFree(&wp->mconf);
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
Assert(sk->outbuf.data != NULL);
|
||||
pfree(sk->outbuf.data);
|
||||
MembershipConfigurationFree(&sk->greetResponse.mconf);
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL;
|
||||
@@ -309,7 +308,6 @@ ShutdownConnection(Safekeeper *sk)
|
||||
sk->state = SS_OFFLINE;
|
||||
sk->streamingAt = InvalidXLogRecPtr;
|
||||
|
||||
MembershipConfigurationFree(&sk->greetResponse.mconf);
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL;
|
||||
@@ -600,14 +598,11 @@ static void
|
||||
SendStartWALPush(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
#define CMD_LEN 512
|
||||
char cmd[CMD_LEN];
|
||||
|
||||
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
|
||||
if (!wp->api.conn_send_query(sk, cmd))
|
||||
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
|
||||
{
|
||||
wp_log(WARNING, "failed to send %s query to safekeeper %s:%s: %s",
|
||||
cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
@@ -663,33 +658,23 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
|
||||
/*
|
||||
* Start handshake: first of all send information about the
|
||||
* walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
|
||||
* safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
|
||||
* a response to finish the handshake.
|
||||
*/
|
||||
static void
|
||||
SendProposerGreeting(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);
|
||||
|
||||
wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
|
||||
pfree(mconf_toml);
|
||||
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
|
||||
&sk->outbuf, wp->config->proto_version);
|
||||
|
||||
/*
|
||||
* On failure, logging & resetting the connection is handled. We just need
|
||||
* to handle the control flow.
|
||||
*/
|
||||
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
|
||||
BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV);
|
||||
}
|
||||
|
||||
static void
|
||||
RecvAcceptorGreeting(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
char *mconf_toml;
|
||||
|
||||
/*
|
||||
* If our reading doesn't immediately succeed, any necessary error
|
||||
@@ -700,10 +685,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
|
||||
return;
|
||||
|
||||
mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
|
||||
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT,
|
||||
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
|
||||
pfree(mconf_toml);
|
||||
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
|
||||
|
||||
/* Protocol is all good, move to voting. */
|
||||
sk->state = SS_VOTING;
|
||||
@@ -725,9 +707,12 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
wp->propTerm++;
|
||||
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
|
||||
wp->voteRequest.pam.tag = 'v';
|
||||
wp->voteRequest.generation = wp->mconf.generation;
|
||||
wp->voteRequest.term = wp->propTerm;
|
||||
wp->voteRequest = (VoteRequest)
|
||||
{
|
||||
.tag = 'v',
|
||||
.term = wp->propTerm
|
||||
};
|
||||
memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN);
|
||||
}
|
||||
}
|
||||
else if (sk->greetResponse.term > wp->propTerm)
|
||||
@@ -774,14 +759,12 @@ SendVoteRequest(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
|
||||
&sk->outbuf, wp->config->proto_version);
|
||||
|
||||
/* We have quorum for voting, send our vote request */
|
||||
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
|
||||
wp->voteRequest.generation, wp->voteRequest.term);
|
||||
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
|
||||
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
|
||||
return;
|
||||
|
||||
/* If successful, wait for read-ready with SS_WAIT_VERDICT */
|
||||
}
|
||||
|
||||
@@ -795,12 +778,11 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
return;
|
||||
|
||||
wp_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
|
||||
sk->voteResponse.voteGiven,
|
||||
GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
|
||||
|
||||
/*
|
||||
* In case of acceptor rejecting our vote, bail out, but only if either it
|
||||
@@ -865,9 +847,9 @@ HandleElectedProposer(WalProposer *wp)
|
||||
* otherwise we must be sync-safekeepers and we have nothing to do then.
|
||||
*
|
||||
* Proceeding is not only pointless but harmful, because we'd give
|
||||
* safekeepers term history starting with 0/0. These hacks will go away
|
||||
* once we disable implicit timeline creation on safekeepers and create it
|
||||
* with non zero LSN from the start.
|
||||
* safekeepers term history starting with 0/0. These hacks will go away once
|
||||
* we disable implicit timeline creation on safekeepers and create it with
|
||||
* non zero LSN from the start.
|
||||
*/
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr)
|
||||
{
|
||||
@@ -960,6 +942,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
wp->propEpochStartLsn = InvalidXLogRecPtr;
|
||||
wp->donorEpoch = 0;
|
||||
wp->truncateLsn = InvalidXLogRecPtr;
|
||||
wp->timelineStartLsn = InvalidXLogRecPtr;
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
@@ -976,6 +959,20 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
wp->donor = i;
|
||||
}
|
||||
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
|
||||
|
||||
if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
/* timelineStartLsn should be the same everywhere or unknown */
|
||||
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
|
||||
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
|
||||
{
|
||||
wp_log(WARNING,
|
||||
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->timelineStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
|
||||
}
|
||||
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -998,11 +995,22 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
|
||||
{
|
||||
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
|
||||
if (wp->timelineStartLsn == InvalidXLogRecPtr)
|
||||
{
|
||||
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
|
||||
}
|
||||
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
}
|
||||
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
|
||||
|
||||
Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);
|
||||
/*
|
||||
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
|
||||
* it should never be zero at this point, if we know timelineStartLsn.
|
||||
*
|
||||
* timelineStartLsn can be zero only on the first syncSafekeepers run.
|
||||
*/
|
||||
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
|
||||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
|
||||
|
||||
/*
|
||||
* We will be generating WAL since propEpochStartLsn, so we should set
|
||||
@@ -1044,11 +1052,10 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
|
||||
{
|
||||
/*
|
||||
* However, allow to proceed if last_log_term on the node which
|
||||
* gave the highest vote (i.e. point where we are going to start
|
||||
* writing) actually had been won by me; plain restart of
|
||||
* walproposer not intervened by concurrent compute which wrote
|
||||
* WAL is ok.
|
||||
* However, allow to proceed if last_log_term on the node which gave
|
||||
* the highest vote (i.e. point where we are going to start writing)
|
||||
* actually had been won by me; plain restart of walproposer not
|
||||
* intervened by concurrent compute which wrote WAL is ok.
|
||||
*
|
||||
* This avoids compute crash after manual term_bump.
|
||||
*/
|
||||
@@ -1118,8 +1125,14 @@ SendProposerElected(Safekeeper *sk)
|
||||
{
|
||||
/* safekeeper is empty or no common point, start from the beginning */
|
||||
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
|
||||
|
||||
/*
|
||||
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
|
||||
* is created manually (test_s3_wal_replay)
|
||||
*/
|
||||
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1144,19 +1157,29 @@ SendProposerElected(Safekeeper *sk)
|
||||
|
||||
Assert(sk->startStreamingAt <= wp->availableLsn);
|
||||
|
||||
msg.apm.tag = 'e';
|
||||
msg.generation = wp->mconf.generation;
|
||||
msg.tag = 'e';
|
||||
msg.term = wp->propTerm;
|
||||
msg.startStreamingAt = sk->startStreamingAt;
|
||||
msg.termHistory = &wp->propTermHistory;
|
||||
msg.timelineStartLsn = wp->timelineStartLsn;
|
||||
|
||||
lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
|
||||
wp_log(LOG,
|
||||
"sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s",
|
||||
sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt),
|
||||
lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);
|
||||
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
pq_sendint64_le(&sk->outbuf, msg.tag);
|
||||
pq_sendint64_le(&sk->outbuf, msg.term);
|
||||
pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
|
||||
pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
|
||||
for (int i = 0; i < msg.termHistory->n_entries; i++)
|
||||
{
|
||||
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
|
||||
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
|
||||
}
|
||||
pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);
|
||||
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);
|
||||
if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
|
||||
return;
|
||||
|
||||
@@ -1222,13 +1245,14 @@ static void
|
||||
PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
|
||||
{
|
||||
Assert(endLsn >= beginLsn);
|
||||
req->apm.tag = 'a';
|
||||
req->generation = wp->mconf.generation;
|
||||
req->tag = 'a';
|
||||
req->term = wp->propTerm;
|
||||
req->epochStartLsn = wp->propEpochStartLsn;
|
||||
req->beginLsn = beginLsn;
|
||||
req->endLsn = endLsn;
|
||||
req->commitLsn = wp->commitLsn;
|
||||
req->truncateLsn = wp->truncateLsn;
|
||||
req->proposerId = wp->greetRequest.proposerId;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1329,8 +1353,7 @@ SendAppendRequests(Safekeeper *sk)
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
/* write AppendRequest header */
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version);
|
||||
/* prepare for reading WAL into the outbuf */
|
||||
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
sk->active_state = SS_ACTIVE_READ_WAL;
|
||||
}
|
||||
@@ -1343,17 +1366,14 @@ SendAppendRequests(Safekeeper *sk)
|
||||
req = &sk->appendRequest;
|
||||
req_len = req->endLsn - req->beginLsn;
|
||||
|
||||
/*
|
||||
* We send zero sized AppendRequests as heartbeats; don't wal_read
|
||||
* for these.
|
||||
*/
|
||||
/* We send zero sized AppendRequests as heartbeats; don't wal_read for these. */
|
||||
if (req_len > 0)
|
||||
{
|
||||
switch (wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req_len,
|
||||
&errmsg))
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req_len,
|
||||
&errmsg))
|
||||
{
|
||||
case NEON_WALREAD_SUCCESS:
|
||||
break;
|
||||
@@ -1361,7 +1381,7 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
sk->host, sk->port, errmsg);
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1449,11 +1469,11 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
* Term has changed to higher one, probably another compute is
|
||||
* running. If this is the case we could PANIC as well because
|
||||
* likely it inserted some data and our basebackup is unsuitable
|
||||
* anymore. However, we also bump term manually (term_bump
|
||||
* endpoint) on safekeepers for migration purposes, in this case
|
||||
* we do want compute to stay alive. So restart walproposer with
|
||||
* FATAL instead of panicking; if basebackup is spoiled next
|
||||
* election will notice this.
|
||||
* anymore. However, we also bump term manually (term_bump endpoint)
|
||||
* on safekeepers for migration purposes, in this case we do want
|
||||
* compute to stay alive. So restart walproposer with FATAL instead
|
||||
* of panicking; if basebackup is spoiled next election will notice
|
||||
* this.
|
||||
*/
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
sk->host, sk->port,
|
||||
@@ -1729,208 +1749,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
|
||||
}
|
||||
}
|
||||
|
||||
/* Serialize MembershipConfiguration into buf. */
|
||||
static void
|
||||
MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf)
|
||||
{
|
||||
uint32 i;
|
||||
|
||||
pq_sendint32(buf, mconf->generation);
|
||||
|
||||
pq_sendint32(buf, mconf->members.len);
|
||||
for (i = 0; i < mconf->members.len; i++)
|
||||
{
|
||||
pq_sendint64(buf, mconf->members.m[i].node_id);
|
||||
pq_send_ascii_string(buf, mconf->members.m[i].host);
|
||||
pq_sendint16(buf, mconf->members.m[i].port);
|
||||
}
|
||||
|
||||
/*
|
||||
* There is no special mark for absent new_members; zero members is
* invalid, so zero len means absent.
|
||||
*/
|
||||
pq_sendint32(buf, mconf->new_members.len);
|
||||
for (i = 0; i < mconf->new_members.len; i++)
|
||||
{
|
||||
pq_sendint64(buf, mconf->new_members.m[i].node_id);
|
||||
pq_send_ascii_string(buf, mconf->new_members.m[i].host);
|
||||
pq_sendint16(buf, mconf->new_members.m[i].port);
|
||||
}
|
||||
}
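For reference, the byte layout produced by MembershipConfigurationSerialize above can be restated as a small encoder. The Rust sketch below assumes that pq_sendint16/32/64 write network byte order and that pq_send_ascii_string appends the string plus a trailing NUL, as in upstream Postgres pqformat; the Rust function names are hypothetical.

```rust
/// Hypothetical encoder mirroring the C serializer above:
/// generation:u32, members.len:u32, then per member (node_id:u64, host as
/// NUL-terminated ASCII, port:u16), then the same again for new_members.
fn encode_mconf(
    generation: u32,
    members: &[(u64, &str, u16)],
    new_members: &[(u64, &str, u16)],
) -> Vec<u8> {
    fn put_set(buf: &mut Vec<u8>, set: &[(u64, &str, u16)]) {
        buf.extend_from_slice(&(set.len() as u32).to_be_bytes());
        for (node_id, host, port) in set {
            buf.extend_from_slice(&node_id.to_be_bytes());
            buf.extend_from_slice(host.as_bytes());
            buf.push(0); // pq_send_ascii_string's trailing NUL (assumed)
            buf.extend_from_slice(&port.to_be_bytes());
        }
    }
    let mut buf = Vec::new();
    buf.extend_from_slice(&generation.to_be_bytes());
    put_set(&mut buf, members);
    put_set(&mut buf, new_members); // zero length means "absent"
    buf
}

fn main() {
    let buf = encode_mconf(3, &[(1, "sk-1", 6500)], &[]);
    // 4 (gen) + 4 (len) + 8 (node_id) + 5 ("sk-1\0") + 2 (port) + 4 (empty new_members)
    assert_eq!(buf.len(), 27);
}
```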
|
||||
|
||||
/* Serialize proposer -> acceptor message into buf using specified version */
|
||||
static void
|
||||
PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version)
|
||||
{
|
||||
/* both versions are supported currently until we fully migrate to 3 */
|
||||
Assert(proto_version == 3 || proto_version == 2);
|
||||
|
||||
resetStringInfo(buf);
|
||||
|
||||
if (proto_version == 3)
|
||||
{
|
||||
/*
|
||||
* v2 sends structs for some messages as is, so commonly send tag only
|
||||
* for v3
|
||||
*/
|
||||
pq_sendint8(buf, msg->tag);
|
||||
|
||||
switch (msg->tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
ProposerGreeting *m = (ProposerGreeting *) msg;
|
||||
|
||||
pq_send_ascii_string(buf, m->tenant_id);
|
||||
pq_send_ascii_string(buf, m->timeline_id);
|
||||
MembershipConfigurationSerialize(&m->mconf, buf);
|
||||
pq_sendint32(buf, m->pg_version);
|
||||
pq_sendint64(buf, m->system_id);
|
||||
pq_sendint32(buf, m->wal_seg_size);
|
||||
break;
|
||||
}
|
||||
case 'v':
|
||||
{
|
||||
VoteRequest *m = (VoteRequest *) msg;
|
||||
|
||||
pq_sendint32(buf, m->generation);
|
||||
pq_sendint64(buf, m->term);
|
||||
break;
|
||||
|
||||
}
|
||||
case 'e':
|
||||
{
|
||||
ProposerElected *m = (ProposerElected *) msg;
|
||||
|
||||
pq_sendint32(buf, m->generation);
|
||||
pq_sendint64(buf, m->term);
|
||||
pq_sendint64(buf, m->startStreamingAt);
|
||||
pq_sendint32(buf, m->termHistory->n_entries);
|
||||
for (uint32 i = 0; i < m->termHistory->n_entries; i++)
|
||||
{
|
||||
pq_sendint64(buf, m->termHistory->entries[i].term);
|
||||
pq_sendint64(buf, m->termHistory->entries[i].lsn);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'a':
|
||||
{
|
||||
/*
|
||||
* Note: this serializes only AppendRequestHeader, caller
|
||||
* is expected to append WAL data later.
|
||||
*/
|
||||
AppendRequestHeader *m = (AppendRequestHeader *) msg;
|
||||
|
||||
pq_sendint32(buf, m->generation);
|
||||
pq_sendint64(buf, m->term);
|
||||
pq_sendint64(buf, m->beginLsn);
|
||||
pq_sendint64(buf, m->endLsn);
|
||||
pq_sendint64(buf, m->commitLsn);
|
||||
pq_sendint64(buf, m->truncateLsn);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (proto_version == 2)
|
||||
{
|
||||
switch (msg->tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
/* v2 sent struct as is */
|
||||
ProposerGreeting *m = (ProposerGreeting *) msg;
|
||||
ProposerGreetingV2 greetRequestV2;
|
||||
|
||||
/* Fill also v2 struct. */
|
||||
greetRequestV2.tag = 'g';
|
||||
greetRequestV2.protocolVersion = proto_version;
|
||||
greetRequestV2.pgVersion = m->pg_version;
|
||||
|
||||
/*
|
||||
* v3 removed this field because it's easier to pass as
|
||||
* libpq or START_WAL_PUSH options
|
||||
*/
|
||||
memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId));
|
||||
greetRequestV2.systemId = wp->config->systemId;
|
||||
if (*m->timeline_id != '\0' &&
|
||||
!HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16))
|
||||
wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id);
|
||||
if (*m->tenant_id != '\0' &&
|
||||
!HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16))
|
||||
wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id);
|
||||
|
||||
greetRequestV2.timeline = wp->config->pgTimeline;
|
||||
greetRequestV2.walSegSize = wp->config->wal_segment_size;
|
||||
|
||||
pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2));
|
||||
break;
|
||||
}
|
||||
case 'v':
|
||||
{
|
||||
/* v2 sent struct as is */
|
||||
VoteRequest *m = (VoteRequest *) msg;
|
||||
VoteRequestV2 voteRequestV2;
|
||||
|
||||
voteRequestV2.tag = m->pam.tag;
|
||||
voteRequestV2.term = m->term;
|
||||
/* removed field */
|
||||
memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId));
|
||||
pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2));
|
||||
break;
|
||||
}
|
||||
case 'e':
|
||||
{
|
||||
ProposerElected *m = (ProposerElected *) msg;
|
||||
|
||||
pq_sendint64_le(buf, m->apm.tag);
|
||||
pq_sendint64_le(buf, m->term);
|
||||
pq_sendint64_le(buf, m->startStreamingAt);
|
||||
pq_sendint32_le(buf, m->termHistory->n_entries);
|
||||
for (int i = 0; i < m->termHistory->n_entries; i++)
|
||||
{
|
||||
pq_sendint64_le(buf, m->termHistory->entries[i].term);
|
||||
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
|
||||
}
|
||||
pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */
|
||||
break;
|
||||
}
|
||||
case 'a':
|
||||
|
||||
/*
|
||||
* Note: this serializes only AppendRequestHeader, caller is
|
||||
* expected to append WAL data later.
|
||||
*/
|
||||
{
|
||||
/* v2 sent struct as is */
|
||||
AppendRequestHeader *m = (AppendRequestHeader *) msg;
|
||||
AppendRequestHeaderV2 appendRequestHeaderV2;
|
||||
|
||||
appendRequestHeaderV2.tag = m->apm.tag;
|
||||
appendRequestHeaderV2.term = m->term;
|
||||
appendRequestHeaderV2.epochStartLsn = 0; /* removed field */
|
||||
appendRequestHeaderV2.beginLsn = m->beginLsn;
|
||||
appendRequestHeaderV2.endLsn = m->endLsn;
|
||||
appendRequestHeaderV2.commitLsn = m->commitLsn;
|
||||
appendRequestHeaderV2.truncateLsn = m->truncateLsn;
|
||||
/* removed field */
|
||||
memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId));
|
||||
|
||||
pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
|
||||
}
|
||||
return;
|
||||
}
|
||||
wp_log(FATAL, "unexpected proto_version %d", proto_version);
|
||||
}
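To make the v2/v3 split handled above concrete, here is how a VoteRequest would look on the wire in each version, sketched in Rust. It assumes pq_sendint32/pq_sendint64 write network byte order, and that the v2 path ships the packed VoteRequestV2 struct in host little-endian order via the raw pq_sendbytes; names are illustrative, not the authoritative implementation.

```rust
/// v3: 1-byte tag, then generation and term in network byte order
/// (pq_sendint32 / pq_sendint64 in the serializer above).
fn encode_vote_request_v3(generation: u32, term: u64) -> Vec<u8> {
    let mut buf = vec![b'v'];
    buf.extend_from_slice(&generation.to_be_bytes());
    buf.extend_from_slice(&term.to_be_bytes());
    buf
}

/// v2: the VoteRequestV2 struct is sent as raw memory: u64 tag, u64 term,
/// 16-byte proposerId zeroed out (assuming a little-endian host).
fn encode_vote_request_v2(term: u64) -> Vec<u8> {
    let mut buf = Vec::new();
    buf.extend_from_slice(&(b'v' as u64).to_le_bytes());
    buf.extend_from_slice(&term.to_le_bytes());
    buf.extend_from_slice(&[0u8; 16]); // removed proposerId field, zeroed
    buf
}

fn main() {
    assert_eq!(encode_vote_request_v3(7, 42).len(), 1 + 4 + 8);
    assert_eq!(encode_vote_request_v2(42).len(), 8 + 8 + 16);
}
```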
|
||||
|
||||
/*
|
||||
* Try to read CopyData message from i'th safekeeper, resetting connection on
|
||||
* failure.
|
||||
@@ -1960,37 +1778,6 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Deserialize membership configuration from buf to mconf. */
|
||||
static void
|
||||
MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
|
||||
{
|
||||
uint32 i;
|
||||
|
||||
mconf->generation = pq_getmsgint32(buf);
|
||||
mconf->members.len = pq_getmsgint32(buf);
|
||||
mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
|
||||
for (i = 0; i < mconf->members.len; i++)
|
||||
{
|
||||
const char *buf_host;
|
||||
|
||||
mconf->members.m[i].node_id = pq_getmsgint64(buf);
|
||||
buf_host = pq_getmsgrawstring(buf);
|
||||
strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host));
|
||||
mconf->members.m[i].port = pq_getmsgint16(buf);
|
||||
}
|
||||
mconf->new_members.len = pq_getmsgint32(buf);
|
||||
mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
|
||||
for (i = 0; i < mconf->new_members.len; i++)
|
||||
{
|
||||
const char *buf_host;
|
||||
|
||||
mconf->new_members.m[i].node_id = pq_getmsgint64(buf);
|
||||
buf_host = pq_getmsgrawstring(buf);
|
||||
strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host));
|
||||
mconf->new_members.m[i].port = pq_getmsgint16(buf);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read next message with known type into provided struct, by reading a CopyData
|
||||
* block from the safekeeper's postgres connection, returning whether the read
|
||||
@@ -1999,8 +1786,6 @@ MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo bu
|
||||
* If the read needs more polling, we return 'false' and keep the state
|
||||
* unmodified, waiting until it becomes read-ready to try again. If it fully
|
||||
* failed, a warning is emitted and the connection is reset.
|
||||
*
|
||||
* Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
|
||||
*/
|
||||
static bool
|
||||
AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
@@ -2009,153 +1794,82 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
|
||||
char *buf;
|
||||
int buf_size;
|
||||
uint8 tag;
|
||||
uint64 tag;
|
||||
StringInfoData s;
|
||||
|
||||
if (!(AsyncRead(sk, &buf, &buf_size)))
|
||||
return false;
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
|
||||
/* parse it */
|
||||
s.data = buf;
|
||||
s.len = buf_size;
|
||||
s.maxlen = buf_size;
|
||||
s.cursor = 0;
|
||||
|
||||
if (wp->config->proto_version == 3)
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
tag = pq_getmsgbyte(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
switch (tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
|
||||
|
||||
msg->nodeId = pq_getmsgint64(&s);
|
||||
MembershipConfigurationDeserialize(&msg->mconf, &s);
|
||||
msg->term = pq_getmsgint64(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
case 'v':
|
||||
{
|
||||
VoteResponse *msg = (VoteResponse *) anymsg;
|
||||
|
||||
msg->generation = pq_getmsgint32(&s);
|
||||
msg->term = pq_getmsgint64(&s);
|
||||
msg->voteGiven = pq_getmsgbyte(&s);
|
||||
msg->flushLsn = pq_getmsgint64(&s);
|
||||
msg->truncateLsn = pq_getmsgint64(&s);
|
||||
msg->termHistory.n_entries = pq_getmsgint32(&s);
|
||||
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
|
||||
for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
|
||||
{
|
||||
msg->termHistory.entries[i].term = pq_getmsgint64(&s);
|
||||
msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
|
||||
}
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
case 'a':
|
||||
{
|
||||
AppendResponse *msg = (AppendResponse *) anymsg;
|
||||
|
||||
msg->generation = pq_getmsgint32(&s);
|
||||
msg->term = pq_getmsgint64(&s);
|
||||
msg->flushLsn = pq_getmsgint64(&s);
|
||||
msg->commitLsn = pq_getmsgint64(&s);
|
||||
msg->hs.ts = pq_getmsgint64(&s);
|
||||
msg->hs.xmin.value = pq_getmsgint64(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
|
||||
if (s.len > s.cursor)
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
|
||||
else
|
||||
msg->ps_feedback.present = false;
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
{
|
||||
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
else if (wp->config->proto_version == 2)
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
switch (tag)
|
||||
{
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
switch (tag)
|
||||
{
|
||||
case 'g':
|
||||
case 'g':
|
||||
{
|
||||
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->nodeId = pq_getmsgint64_le(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
case 'v':
|
||||
{
|
||||
VoteResponse *msg = (VoteResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->voteGiven = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->truncateLsn = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
|
||||
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
|
||||
for (int i = 0; i < msg->termHistory.n_entries; i++)
|
||||
{
|
||||
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->nodeId = pq_getmsgint64_le(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
|
||||
}
|
||||
msg->timelineStartLsn = pq_getmsgint64_le(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
case 'v':
|
||||
{
|
||||
VoteResponse *msg = (VoteResponse *) anymsg;
|
||||
case 'a':
|
||||
{
|
||||
AppendResponse *msg = (AppendResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->voteGiven = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->truncateLsn = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
|
||||
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
|
||||
for (int i = 0; i < msg->termHistory.n_entries; i++)
|
||||
{
|
||||
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
|
||||
}
|
||||
pq_getmsgint64_le(&s); /* timelineStartLsn */
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->commitLsn = pq_getmsgint64_le(&s);
|
||||
msg->hs.ts = pq_getmsgint64_le(&s);
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (s.len > s.cursor)
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
|
||||
else
|
||||
msg->ps_feedback.present = false;
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
case 'a':
|
||||
{
|
||||
AppendResponse *msg = (AppendResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->commitLsn = pq_getmsgint64_le(&s);
|
||||
msg->hs.ts = pq_getmsgint64_le(&s);
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (s.len > s.cursor)
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
|
||||
else
|
||||
msg->ps_feedback.present = false;
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
default:
|
||||
{
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2531,45 +2245,3 @@ FormatEvents(WalProposer *wp, uint32 events)
|
||||
|
||||
return (char *) &return_str;
|
||||
}
|
||||
|
||||
/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */
|
||||
static char *
|
||||
MembershipConfigurationToString(MembershipConfiguration *mconf)
|
||||
{
|
||||
StringInfoData s;
|
||||
uint32 i;
|
||||
|
||||
initStringInfo(&s);
|
||||
appendStringInfo(&s, "{gen = %u", mconf->generation);
|
||||
appendStringInfoString(&s, ", members = [");
|
||||
for (i = 0; i < mconf->members.len; i++)
|
||||
{
|
||||
if (i > 0)
|
||||
appendStringInfoString(&s, ", ");
|
||||
appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id);
|
||||
appendStringInfo(&s, ", host = %s", mconf->members.m[i].host);
|
||||
appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port);
|
||||
}
|
||||
appendStringInfo(&s, "], new_members = [");
|
||||
for (i = 0; i < mconf->new_members.len; i++)
|
||||
{
|
||||
if (i > 0)
|
||||
appendStringInfoString(&s, ", ");
|
||||
appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id);
|
||||
appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host);
|
||||
appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port);
|
||||
}
|
||||
appendStringInfoString(&s, "]}");
|
||||
return s.data;
|
||||
}
|
||||
|
||||
static void
|
||||
MembershipConfigurationFree(MembershipConfiguration *mconf)
|
||||
{
|
||||
if (mconf->members.m)
|
||||
pfree(mconf->members.m);
|
||||
mconf->members.m = NULL;
|
||||
if (mconf->new_members.m)
|
||||
pfree(mconf->new_members.m);
|
||||
mconf->new_members.m = NULL;
|
||||
}
|
||||
|
||||
@@ -12,6 +12,9 @@
|
||||
#include "neon_walreader.h"
|
||||
#include "pagestore_client.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
#define MAX_SAFEKEEPERS 32
|
||||
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL
|
||||
* message */
|
||||
@@ -140,71 +143,12 @@ typedef uint64 term_t;
|
||||
/* neon storage node id */
|
||||
typedef uint64 NNodeId;
|
||||
|
||||
/*
|
||||
* Number uniquely identifying safekeeper membership configuration.
|
||||
* This and following structs pair ones in membership.rs.
|
||||
*/
|
||||
typedef uint32 Generation;
|
||||
|
||||
typedef struct SafekeeperId
|
||||
{
|
||||
NNodeId node_id;
|
||||
char host[MAXCONNINFO];
|
||||
uint16 port;
|
||||
} SafekeeperId;
|
||||
|
||||
/* Set of safekeepers. */
|
||||
typedef struct MemberSet
|
||||
{
|
||||
uint32 len; /* number of members */
|
||||
SafekeeperId *m; /* ids themselves */
|
||||
} MemberSet;
|
||||
|
||||
/* Timeline safekeeper membership configuration. */
|
||||
typedef struct MembershipConfiguration
|
||||
{
|
||||
Generation generation;
|
||||
MemberSet members;
|
||||
/* Has 0 n_members in non joint conf. */
|
||||
MemberSet new_members;
|
||||
} MembershipConfiguration;
|
||||
|
||||
/*
|
||||
* Proposer <-> Acceptor messaging.
|
||||
*/
|
||||
|
||||
typedef struct ProposerAcceptorMessage
|
||||
{
|
||||
uint8 tag;
|
||||
} ProposerAcceptorMessage;
|
||||
|
||||
/* Initial Proposer -> Acceptor message */
|
||||
typedef struct ProposerGreeting
|
||||
{
|
||||
ProposerAcceptorMessage pam; /* message tag */
|
||||
|
||||
/*
|
||||
* tenant/timeline ids as C strings with standard hex notation for ease of
|
||||
* printing. In principle they are not strictly needed as ttid is also
|
||||
* passed as libpq options.
|
||||
*/
|
||||
char *tenant_id;
|
||||
char *timeline_id;
|
||||
/* Full conf is carried to allow safekeeper switch */
|
||||
MembershipConfiguration mconf;
|
||||
|
||||
/*
|
||||
* pg_version and wal_seg_size are used for timeline creation until we
|
||||
* fully migrate to doing externally. systemId is only used as a sanity
|
||||
* cross check.
|
||||
*/
|
||||
uint32 pg_version; /* in PG_VERSION_NUM format */
|
||||
uint64 system_id; /* Postgres system identifier. */
|
||||
uint32 wal_seg_size;
|
||||
} ProposerGreeting;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct ProposerGreetingV2
|
||||
{
|
||||
uint64 tag; /* message tag */
|
||||
uint32 protocolVersion; /* proposer-safekeeper protocol version */
|
||||
@@ -215,42 +159,32 @@ typedef struct ProposerGreetingV2
|
||||
uint8 tenant_id[16];
|
||||
TimeLineID timeline;
|
||||
uint32 walSegSize;
|
||||
} ProposerGreetingV2;
|
||||
} ProposerGreeting;
|
||||
|
||||
typedef struct AcceptorProposerMessage
|
||||
{
|
||||
uint8 tag;
|
||||
uint64 tag;
|
||||
} AcceptorProposerMessage;
|
||||
|
||||
/*
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for,
|
||||
* its node id and configuration.
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
|
||||
*/
|
||||
typedef struct AcceptorGreeting
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
NNodeId nodeId;
|
||||
MembershipConfiguration mconf;
|
||||
term_t term;
|
||||
NNodeId nodeId;
|
||||
} AcceptorGreeting;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor vote request.
|
||||
*/
|
||||
typedef struct VoteRequest
|
||||
{
|
||||
ProposerAcceptorMessage pam; /* message tag */
|
||||
Generation generation; /* membership conf generation */
|
||||
term_t term;
|
||||
} VoteRequest;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct VoteRequestV2
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} VoteRequestV2;
|
||||
} VoteRequest;
|
||||
|
||||
/* Element of term switching chain. */
|
||||
typedef struct TermSwitchEntry
|
||||
@@ -269,15 +203,8 @@ typedef struct TermHistory
|
||||
typedef struct VoteResponse
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
|
||||
/*
|
||||
* Membership conf generation. It's not strictly required because on
|
||||
* mismatch safekeeper is expected to ERROR the connection, but let's
|
||||
* sanity check it.
|
||||
*/
|
||||
Generation generation;
|
||||
term_t term;
|
||||
uint8 voteGiven;
|
||||
uint64 voteGiven;
|
||||
|
||||
/*
|
||||
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
@@ -287,6 +214,7 @@ typedef struct VoteResponse
|
||||
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for
|
||||
* recovery of some safekeeper */
|
||||
TermHistory termHistory;
|
||||
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
|
||||
} VoteResponse;
|
||||
|
||||
/*
@@ -295,37 +223,20 @@ typedef struct VoteResponse
 */
typedef struct ProposerElected
{
    AcceptorProposerMessage apm;
    Generation  generation;     /* membership conf generation */
    uint64      tag;
    term_t      term;
    /* proposer will send since this point */
    XLogRecPtr  startStreamingAt;
    /* history of term switches up to this proposer */
    TermHistory *termHistory;
    /* timeline globally starts at this LSN */
    XLogRecPtr  timelineStartLsn;
} ProposerElected;

/*
 * Header of request with WAL message sent from proposer to safekeeper.
 */
typedef struct AppendRequestHeader
{
    AcceptorProposerMessage apm;
    Generation  generation;     /* membership conf generation */
    term_t      term;           /* term of the proposer */
    XLogRecPtr  beginLsn;       /* start position of message in WAL */
    XLogRecPtr  endLsn;         /* end position of message in WAL */
    XLogRecPtr  commitLsn;      /* LSN committed by quorum of safekeepers */

    /*
     * minimal LSN which may be needed for recovery of some safekeeper (end
     * lsn + 1 of last chunk streamed to everyone)
     */
    XLogRecPtr  truncateLsn;
    /* in the AppendRequest message, WAL data follows */
} AppendRequestHeader;

/* protocol v2 variant, kept while wp supports it */
typedef struct AppendRequestHeaderV2
{
    uint64      tag;
    term_t      term;           /* term of the proposer */
@@ -345,8 +256,7 @@ typedef struct AppendRequestHeaderV2
     */
    XLogRecPtr  truncateLsn;
    pg_uuid_t   proposerId;     /* for monitoring/debugging */
    /* in the AppendRequest message, WAL data follows */
} AppendRequestHeaderV2;
} AppendRequestHeader;
/*
 * Hot standby feedback received from replica
@@ -399,13 +309,6 @@ typedef struct AppendResponse
{
    AcceptorProposerMessage apm;

    /*
     * Membership conf generation. It's not strictly required because on
     * mismatch safekeeper is expected to ERROR the connection, but let's
     * sanity check it.
     */
    Generation  generation;

    /*
     * Current term of the safekeeper; if it is higher than proposer's, the
     * compute is out of date.
@@ -741,8 +644,6 @@ typedef struct WalProposerConfig
    /* Will be passed to safekeepers in greet request. */
    TimeLineID  pgTimeline;

    int         proto_version;

#ifdef WALPROPOSER_LIB
    void       *callback_data;
#endif
@@ -755,14 +656,11 @@ typedef struct WalProposerConfig
typedef struct WalProposer
{
    WalProposerConfig *config;
    /* Current walproposer membership configuration */
    MembershipConfiguration mconf;
    int         n_safekeepers;

    /* (n_safekeepers / 2) + 1 */
    int         quorum;

    /* Number of occupied slots in safekeepers[] */
    int         n_safekeepers;
    Safekeeper  safekeeper[MAX_SAFEKEEPERS];

    /* WAL has been generated up to this point */
@@ -772,7 +670,6 @@ typedef struct WalProposer
    XLogRecPtr  commitLsn;

    ProposerGreeting greetRequest;
    ProposerGreetingV2 greetRequestV2;

    /* Vote request for safekeeper */
    VoteRequest voteRequest;
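As a quick illustration of the quorum arithmetic referenced in the struct above, the proposer only needs acknowledgements from a strict majority of the configured safekeepers:

// Majority quorum as described by the comment above: (n_safekeepers / 2) + 1.
fn quorum(n_safekeepers: usize) -> usize {
    n_safekeepers / 2 + 1
}

fn main() {
    assert_eq!(quorum(3), 2); // 3 safekeepers tolerate 1 failure
    assert_eq!(quorum(5), 3); // 5 safekeepers tolerate 2 failures
}
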
@@ -155,16 +155,6 @@ pq_getmsgend(StringInfo msg)
        ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
}

/* --------------------------------
 * pq_sendbytes - append raw data to a StringInfo buffer
 * --------------------------------
 */
void
pq_sendbytes(StringInfo buf, const void *data, int datalen)
{
    /* use variant that maintains a trailing null-byte, out of caution */
    appendBinaryStringInfo(buf, data, datalen);
}

/*
 * Produce a C-string representation of a TimestampTz.
@@ -59,11 +59,9 @@

#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"

/* GUCs */
char       *wal_acceptors_list = "";
int         wal_acceptor_reconnect_timeout = 1000;
int         wal_acceptor_connection_timeout = 10000;
int         safekeeper_proto_version = 2;

/* Set to true in the walproposer bgw. */
static bool am_walproposer;
@@ -128,7 +126,6 @@ init_walprop_config(bool syncSafekeepers)
    else
        walprop_config.systemId = 0;
    walprop_config.pgTimeline = walprop_pg_get_timeline_id();
    walprop_config.proto_version = safekeeper_proto_version;
}
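On the compute side these GUCs land in postgresql.conf. A plausible snippet is shown below; the values are illustrative, the host:port shape of the list is only implied by the comma-splitting code further down, and the timeout GUC names are not shown in this hunk:

neon.safekeepers = 'sk-1:5454,sk-2:5454,sk-3:5454'
neon.safekeeper_proto_version = 3   # registered below; defaults to 2
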
/*
@@ -222,37 +219,25 @@ nwp_register_gucs(void)
                            PGC_SIGHUP,
                            GUC_UNIT_MS,
                            NULL, NULL, NULL);

    DefineCustomIntVariable(
                            "neon.safekeeper_proto_version",
                            "Version of compute <-> safekeeper protocol.",
                            "Used while migrating from 2 to 3.",
                            &safekeeper_proto_version,
                            2, 0, INT_MAX,
                            PGC_POSTMASTER,
                            0,
                            NULL, NULL, NULL);
}


static int
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
{
    int         n_safekeepers = 0;
    char       *curr_sk = safekeepers_list;
    int         n_safekeepers = 0;
    char       *curr_sk = safekeepers_list;

    for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
    {
        if (++n_safekeepers >= MAX_SAFEKEEPERS)
        {
        if (++n_safekeepers >= MAX_SAFEKEEPERS) {
            wpg_log(FATAL, "too many safekeepers");
        }

        coma = strchr(coma, ',');
        safekeepers[n_safekeepers - 1] = curr_sk;
        safekeepers[n_safekeepers-1] = curr_sk;

        if (coma != NULL)
        {
        if (coma != NULL) {
            *coma++ = '\0';
        }
    }
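The C splitter above simply cuts the GUC value on commas and FATALs once it passes MAX_SAFEKEEPERS entries. A rough Rust restatement of the same idea, illustrative only and not part of the safekeeper crate (the limit value is assumed, the real one lives in the C headers):

// Split a comma-separated list of safekeeper endpoints, refusing unreasonably long lists.
const MAX_SAFEKEEPERS: usize = 32; // assumed limit for illustration

fn split_safekeepers_list(list: &str) -> Result<Vec<&str>, String> {
    let parts: Vec<&str> = list.split(',').filter(|s| !s.is_empty()).collect();
    if parts.len() >= MAX_SAFEKEEPERS {
        return Err("too many safekeepers".to_string());
    }
    Ok(parts)
}
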
@@ -267,10 +252,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
@@ -307,8 +292,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
if (!am_walproposer)
|
||||
return;
|
||||
|
||||
if (!newval)
|
||||
{
|
||||
if (!newval) {
|
||||
/* should never happen */
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
@@ -317,11 +301,11 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
newval_copy = pstrdup(newval);
|
||||
oldval = pstrdup(wal_acceptors_list);
|
||||
|
||||
/*
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit
|
||||
* and thus remove this delay. XXX: If you change anything here, sync with
|
||||
* test_safekeepers_reconfigure_reorder.
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
{
|
||||
@@ -470,8 +454,7 @@ backpressure_throttling_impl(void)
|
||||
memcpy(new_status, old_status, len);
|
||||
snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
|
||||
set_ps_display(new_status);
|
||||
new_status[len] = '\0'; /* truncate off " backpressure ..." to later
|
||||
* reset the ps */
|
||||
new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
|
||||
|
||||
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
|
||||
start = GetCurrentTimestamp();
|
||||
@@ -638,7 +621,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
|
||||
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
LSN_FORMAT_ARGS(startpos));
|
||||
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
||||
cmd.timeline = wp->config->pgTimeline;
|
||||
cmd.timeline = wp->greetRequest.timeline;
|
||||
cmd.startpoint = startpos;
|
||||
StartProposerReplication(wp, &cmd);
|
||||
}
|
||||
@@ -1980,11 +1963,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
FullTransactionId xmin = hsFeedback.xmin;
|
||||
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
|
||||
FullTransactionId next_xid = ReadNextFullTransactionId();
|
||||
|
||||
/*
 * Page server is updating nextXid in checkpoint each 1024 transactions,
 * so the feedback xmin can actually be larger than nextXid, and
 * TransactionIdInRecentPast returns false in this case,
 * preventing update of the slot's xmin.
 */
if (FullTransactionIdPrecedes(next_xid, xmin))
|
||||
|
||||
@@ -26,6 +26,7 @@ hex.workspace = true
|
||||
humantime.workspace = true
|
||||
http.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
futures.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
@@ -39,6 +40,7 @@ scopeguard.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
smallvec.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
@@ -63,6 +65,7 @@ storage_broker.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
utils.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -207,6 +207,13 @@ struct Args {
|
||||
/// Also defines interval for eviction retries.
|
||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
|
||||
eviction_min_resident: Duration,
|
||||
    /// Enable fanning out WAL to different shards from the same reader
    #[arg(long)]
    wal_reader_fanout: bool,
    /// Only fan out the WAL reader if the absolute delta between the new requested position
    /// and the current position of the reader is smaller than this value.
    #[arg(long)]
    max_delta_for_fanout: Option<u64>,
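Assuming clap keeps its default kebab-case naming for these new fields, turning the feature on from the command line would look roughly like the following (illustrative invocation; the delta value is arbitrary and other required flags are elided):

safekeeper --wal-reader-fanout --max-delta-for-fanout 16777216
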
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
@@ -370,6 +377,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
control_file_save_interval: args.control_file_save_interval,
|
||||
partial_backup_concurrency: args.partial_backup_concurrency,
|
||||
eviction_min_resident: args.eviction_min_resident,
|
||||
wal_reader_fanout: args.wal_reader_fanout,
|
||||
max_delta_for_fanout: args.max_delta_for_fanout,
|
||||
});
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
|
||||
@@ -195,7 +195,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: inmem.remote_consistent_lsn,
|
||||
peers: tli.get_peers(conf).await,
|
||||
walsenders: tli.get_walsenders().get_all(),
|
||||
walsenders: tli.get_walsenders().get_all_public(),
|
||||
walreceivers: tli.get_walreceivers().get_all(),
|
||||
};
|
||||
json_response(StatusCode::OK, status)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
use anyhow::Context;
|
||||
use postgres_backend::QueryError;
|
||||
use safekeeper_api::membership::{Configuration, INVALID_GENERATION};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -133,10 +133,10 @@ async fn send_proposer_elected(
|
||||
let history = TermHistory(history_entries);
|
||||
|
||||
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
generation: INVALID_GENERATION,
|
||||
term,
|
||||
start_streaming_at: lsn,
|
||||
term_history: history,
|
||||
timeline_start_lsn: lsn,
|
||||
});
|
||||
|
||||
tli.process_msg(&proposer_elected_request).await?;
|
||||
@@ -170,12 +170,13 @@ pub async fn append_logical_message(
|
||||
|
||||
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
generation: INVALID_GENERATION,
|
||||
term: msg.term,
|
||||
term_start_lsn: begin_lsn,
|
||||
begin_lsn,
|
||||
end_lsn,
|
||||
commit_lsn,
|
||||
truncate_lsn: msg.truncate_lsn,
|
||||
proposer_uuid: [0u8; 16],
|
||||
},
|
||||
wal_data,
|
||||
});
|
||||
|
||||
@@ -108,6 +108,8 @@ pub struct SafeKeeperConf {
|
||||
pub control_file_save_interval: Duration,
|
||||
pub partial_backup_concurrency: usize,
|
||||
pub eviction_min_resident: Duration,
|
||||
pub wal_reader_fanout: bool,
|
||||
pub max_delta_for_fanout: Option<u64>,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -150,6 +152,8 @@ impl SafeKeeperConf {
|
||||
control_file_save_interval: Duration::from_secs(1),
|
||||
partial_backup_concurrency: 1,
|
||||
eviction_min_resident: Duration::ZERO,
|
||||
wal_reader_fanout: false,
|
||||
max_delta_for_fanout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ use metrics::{
|
||||
pow2_buckets,
|
||||
proto::MetricFamily,
|
||||
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec,
|
||||
Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
|
||||
IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
|
||||
register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_ffi::XLogSegNo;
|
||||
@@ -211,6 +211,14 @@ pub static WAL_RECEIVERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_receivers")
|
||||
});
|
||||
pub static WAL_READERS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"safekeeper_wal_readers",
|
||||
"Number of active WAL readers (may serve pageservers or other safekeepers)",
|
||||
&["kind", "target"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_receivers")
|
||||
});
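The new per-(kind, target) reader gauge would be bumped wherever a reader task starts and stops. A hedged usage sketch follows; the label values "task" and "pageserver" are invented here, and the real call sites live elsewhere in this branch:

// Hypothetical call sites for the WAL_READERS gauge vec defined above.
fn on_reader_started() {
    WAL_READERS.with_label_values(&["task", "pageserver"]).inc();
}

fn on_reader_finished() {
    WAL_READERS.with_label_values(&["task", "pageserver"]).dec();
}
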
|
||||
pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
|
||||
// Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and
|
||||
// full queues respectively.
|
||||
@@ -443,6 +451,7 @@ pub struct FullTimelineInfo {
|
||||
pub timeline_is_active: bool,
|
||||
pub num_computes: u32,
|
||||
pub last_removed_segno: XLogSegNo,
|
||||
pub interpreted_wal_reader_tasks: usize,
|
||||
|
||||
pub epoch_start_lsn: Lsn,
|
||||
pub mem_state: TimelineMemState,
|
||||
@@ -472,6 +481,7 @@ pub struct TimelineCollector {
|
||||
disk_usage: GenericGaugeVec<AtomicU64>,
|
||||
acceptor_term: GenericGaugeVec<AtomicU64>,
|
||||
written_wal_bytes: GenericGaugeVec<AtomicU64>,
|
||||
interpreted_wal_reader_tasks: GenericGaugeVec<AtomicU64>,
|
||||
written_wal_seconds: GaugeVec,
|
||||
flushed_wal_seconds: GaugeVec,
|
||||
collect_timeline_metrics: Gauge,
|
||||
@@ -670,6 +680,16 @@ impl TimelineCollector {
|
||||
.unwrap();
|
||||
descs.extend(active_timelines_count.desc().into_iter().cloned());
|
||||
|
||||
let interpreted_wal_reader_tasks = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_interpreted_wal_reader_tasks",
|
||||
"Number of active interpreted wal reader tasks, grouped by timeline",
|
||||
),
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned());
|
||||
|
||||
TimelineCollector {
|
||||
global_timelines,
|
||||
descs,
|
||||
@@ -693,6 +713,7 @@ impl TimelineCollector {
|
||||
collect_timeline_metrics,
|
||||
timelines_count,
|
||||
active_timelines_count,
|
||||
interpreted_wal_reader_tasks,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -721,6 +742,7 @@ impl Collector for TimelineCollector {
|
||||
self.disk_usage.reset();
|
||||
self.acceptor_term.reset();
|
||||
self.written_wal_bytes.reset();
|
||||
self.interpreted_wal_reader_tasks.reset();
|
||||
self.written_wal_seconds.reset();
|
||||
self.flushed_wal_seconds.reset();
|
||||
|
||||
@@ -782,6 +804,9 @@ impl Collector for TimelineCollector {
|
||||
self.written_wal_bytes
|
||||
.with_label_values(labels)
|
||||
.set(tli.wal_storage.write_wal_bytes);
|
||||
self.interpreted_wal_reader_tasks
|
||||
.with_label_values(labels)
|
||||
.set(tli.interpreted_wal_reader_tasks as u64);
|
||||
self.written_wal_seconds
|
||||
.with_label_values(labels)
|
||||
.set(tli.wal_storage.write_wal_seconds);
|
||||
@@ -834,6 +859,7 @@ impl Collector for TimelineCollector {
|
||||
mfs.extend(self.disk_usage.collect());
|
||||
mfs.extend(self.acceptor_term.collect());
|
||||
mfs.extend(self.written_wal_bytes.collect());
|
||||
mfs.extend(self.interpreted_wal_reader_tasks.collect());
|
||||
mfs.extend(self.written_wal_seconds.collect());
|
||||
mfs.extend(self.flushed_wal_seconds.collect());
|
||||
|
||||
|
||||
@@ -281,7 +281,7 @@ impl SafekeeperPostgresHandler {
|
||||
tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
|
||||
_ = timeline_cancel.cancelled() => {
|
||||
return Err(CopyStreamHandlerEnd::Cancelled);
|
||||
}
|
||||
@@ -342,8 +342,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
let tli = match next_msg {
|
||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
||||
info!(
|
||||
"start handshake with walproposer {} sysid {}",
|
||||
self.peer_addr, greeting.system_id,
|
||||
"start handshake with walproposer {} sysid {} timeline {}",
|
||||
self.peer_addr, greeting.system_id, greeting.tli,
|
||||
);
|
||||
let server_info = ServerInfo {
|
||||
pg_version: greeting.pg_version,
|
||||
@@ -459,7 +459,6 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
mut reply_rx: Receiver<AcceptorProposerMessage>,
|
||||
mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
|
||||
proto_version: u32,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut buf = BytesMut::with_capacity(128);
|
||||
|
||||
@@ -497,7 +496,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
};
|
||||
|
||||
buf.clear();
|
||||
msg.serialize(&mut buf, proto_version)?;
|
||||
msg.serialize(&mut buf)?;
|
||||
pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use std::{fmt, pin::pin};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::StreamExt;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use safekeeper_api::membership::INVALID_GENERATION;
|
||||
use safekeeper_api::models::{PeerInfo, TimelineStatus};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
||||
@@ -268,10 +267,7 @@ async fn recover(
|
||||
);
|
||||
|
||||
// Now understand our term history.
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
|
||||
generation: INVALID_GENERATION,
|
||||
term: donor.term,
|
||||
});
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
|
||||
let vote_response = match tli
|
||||
.process_msg(&vote_request)
|
||||
.await
|
||||
@@ -306,10 +302,10 @@ async fn recover(
|
||||
|
||||
// truncate WAL locally
|
||||
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
generation: INVALID_GENERATION,
|
||||
term: donor.term,
|
||||
start_streaming_at: last_common_point.lsn,
|
||||
term_history: donor_th,
|
||||
timeline_start_lsn: Lsn::INVALID,
|
||||
});
|
||||
// Successful ProposerElected handling always returns None. If term changed,
|
||||
// we'll find out that during the streaming. Note: it is expected to get
|
||||
@@ -438,12 +434,13 @@ async fn network_io(
|
||||
match msg {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
let ar_hdr = AppendRequestHeader {
|
||||
generation: INVALID_GENERATION,
|
||||
term: donor.term,
|
||||
term_start_lsn: Lsn::INVALID, // unused
|
||||
begin_lsn: Lsn(xlog_data.wal_start()),
|
||||
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
|
||||
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
|
||||
truncate_lsn: Lsn::INVALID, // do not attempt to advance
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let ar = AppendRequest {
|
||||
h: ar_hdr,
|
||||
|
||||
@@ -5,11 +5,6 @@ use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
|
||||
use safekeeper_api::membership;
|
||||
use safekeeper_api::membership::Generation;
|
||||
use safekeeper_api::membership::MemberSet;
|
||||
use safekeeper_api::membership::SafekeeperId;
|
||||
use safekeeper_api::membership::INVALID_GENERATION;
|
||||
use safekeeper_api::models::HotStandbyFeedback;
|
||||
use safekeeper_api::Term;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -17,7 +12,6 @@ use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
use std::fmt;
|
||||
use std::io::Read;
|
||||
use std::str::FromStr;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
|
||||
use tracing::*;
|
||||
@@ -35,8 +29,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub const SK_PROTO_VERSION_2: u32 = 2;
|
||||
pub const SK_PROTO_VERSION_3: u32 = 3;
|
||||
pub const SK_PROTOCOL_VERSION: u32 = 2;
|
||||
pub const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -63,28 +56,8 @@ impl TermHistory {
|
||||
TermHistory(Vec::new())
|
||||
}
|
||||
|
||||
// Parse TermHistory as n_entries followed by TermLsn pairs in network order.
|
||||
// Parse TermHistory as n_entries followed by TermLsn pairs
|
||||
pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
|
||||
let n_entries = bytes
|
||||
.get_u32_f()
|
||||
.with_context(|| "TermHistory misses len")?;
|
||||
let mut res = Vec::with_capacity(n_entries as usize);
|
||||
for i in 0..n_entries {
|
||||
let term = bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| format!("TermHistory pos {} misses term", i))?;
|
||||
let lsn = bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| format!("TermHistory pos {} misses lsn", i))?
|
||||
.into();
|
||||
res.push(TermLsn { term, lsn })
|
||||
}
|
||||
Ok(TermHistory(res))
|
||||
}
|
||||
|
||||
// Parse TermHistory as n_entries followed by TermLsn pairs in LE order.
|
||||
// TODO remove once v2 protocol is fully dropped.
|
||||
pub fn from_bytes_le(bytes: &mut Bytes) -> Result<TermHistory> {
|
||||
if bytes.remaining() < 4 {
|
||||
bail!("TermHistory misses len");
|
||||
}
|
||||
@@ -224,18 +197,6 @@ impl AcceptorState {
|
||||
/// Initial Proposer -> Acceptor message
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ProposerGreeting {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mconf: membership::Configuration,
|
||||
/// Postgres server version
|
||||
pub pg_version: u32,
|
||||
pub system_id: SystemId,
|
||||
pub wal_seg_size: u32,
|
||||
}
|
||||
|
||||
/// V2 of the message; exists as a struct because we (de)serialized it as is.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ProposerGreetingV2 {
|
||||
/// proposer-acceptor protocol version
|
||||
pub protocol_version: u32,
|
||||
/// Postgres server version
|
||||
@@ -252,35 +213,27 @@ pub struct ProposerGreetingV2 {
|
||||
/// (acceptor voted for).
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct AcceptorGreeting {
|
||||
node_id: NodeId,
|
||||
mconf: membership::Configuration,
|
||||
term: u64,
|
||||
node_id: NodeId,
|
||||
}
|
||||
|
||||
/// Vote request sent from proposer to safekeepers
|
||||
#[derive(Debug)]
|
||||
pub struct VoteRequest {
|
||||
pub generation: Generation,
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
/// V2 of the message; exists as a struct because we (de)serialized it as is.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct VoteRequestV2 {
|
||||
pub struct VoteRequest {
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
/// Vote itself, sent from safekeeper to proposer
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct VoteResponse {
|
||||
generation: Generation, // membership conf generation
|
||||
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
vote_given: bool,
|
||||
vote_given: u64, // fixme u64 due to padding
|
||||
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
// proposer to choose the most advanced one.
|
||||
pub flush_lsn: Lsn,
|
||||
truncate_lsn: Lsn,
|
||||
pub term_history: TermHistory,
|
||||
timeline_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -289,10 +242,10 @@ pub struct VoteResponse {
|
||||
*/
|
||||
#[derive(Debug)]
|
||||
pub struct ProposerElected {
|
||||
pub generation: Generation, // membership conf generation
|
||||
pub term: Term,
|
||||
pub start_streaming_at: Lsn,
|
||||
pub term_history: TermHistory,
|
||||
pub timeline_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Request with WAL message sent from proposer to safekeeper. Along the way it
|
||||
@@ -304,22 +257,6 @@ pub struct AppendRequest {
|
||||
}
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct AppendRequestHeader {
|
||||
pub generation: Generation, // membership conf generation
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
/// start position of message in WAL
|
||||
pub begin_lsn: Lsn,
|
||||
/// end position of message in WAL
|
||||
pub end_lsn: Lsn,
|
||||
/// LSN committed by quorum of safekeepers
|
||||
pub commit_lsn: Lsn,
|
||||
/// minimal LSN which may be needed by proposer to perform recovery of some safekeeper
|
||||
pub truncate_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// V2 of the message; exists as a struct because we (de)serialized it as is.
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct AppendRequestHeaderV2 {
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
// TODO: remove this field from the protocol, it in unused -- LSN of term
|
||||
@@ -340,9 +277,6 @@ pub struct AppendRequestHeaderV2 {
|
||||
/// Report safekeeper state to proposer
|
||||
#[derive(Debug, Serialize, Clone)]
|
||||
pub struct AppendResponse {
|
||||
// Membership conf generation. Not strictly required because on mismatch
|
||||
// connection is reset, but let's sanity check it.
|
||||
generation: Generation,
|
||||
// Current term of the safekeeper; if it is higher than proposer's, the
|
||||
// compute is out of date.
|
||||
pub term: Term,
|
||||
@@ -359,9 +293,8 @@ pub struct AppendResponse {
|
||||
}
|
||||
|
||||
impl AppendResponse {
|
||||
fn term_only(generation: Generation, term: Term) -> AppendResponse {
|
||||
fn term_only(term: Term) -> AppendResponse {
|
||||
AppendResponse {
|
||||
generation,
|
||||
term,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
@@ -382,317 +315,72 @@ pub enum ProposerAcceptorMessage {
|
||||
FlushWAL,
|
||||
}
|
||||
|
||||
/// Augment Bytes with fallible get_uN where N is number of bytes methods.
|
||||
/// All reads are in network (big endian) order.
|
||||
trait BytesF {
|
||||
fn get_u8_f(&mut self) -> Result<u8>;
|
||||
fn get_u16_f(&mut self) -> Result<u16>;
|
||||
fn get_u32_f(&mut self) -> Result<u32>;
|
||||
fn get_u64_f(&mut self) -> Result<u64>;
|
||||
}
|
||||
|
||||
impl BytesF for Bytes {
|
||||
fn get_u8_f(&mut self) -> Result<u8> {
|
||||
if self.is_empty() {
|
||||
bail!("no bytes left, expected 1");
|
||||
}
|
||||
Ok(self.get_u8())
|
||||
}
|
||||
fn get_u16_f(&mut self) -> Result<u16> {
|
||||
if self.is_empty() {
|
||||
bail!("no bytes left, expected 2");
|
||||
}
|
||||
Ok(self.get_u16())
|
||||
}
|
||||
fn get_u32_f(&mut self) -> Result<u32> {
|
||||
if self.remaining() < 4 {
|
||||
bail!("only {} bytes left, expected 4", self.remaining());
|
||||
}
|
||||
Ok(self.get_u32())
|
||||
}
|
||||
fn get_u64_f(&mut self) -> Result<u64> {
|
||||
if self.remaining() < 8 {
|
||||
bail!("only {} bytes left, expected 8", self.remaining());
|
||||
}
|
||||
Ok(self.get_u64())
|
||||
}
|
||||
}
|
||||
|
||||
impl ProposerAcceptorMessage {
|
||||
/// Read cstring from Bytes.
|
||||
fn get_cstr(buf: &mut Bytes) -> Result<String> {
|
||||
let pos = buf
|
||||
.iter()
|
||||
.position(|x| *x == 0)
|
||||
.ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?;
|
||||
let result = buf.split_to(pos);
|
||||
buf.advance(1); // drop the null terminator
|
||||
match std::str::from_utf8(&result) {
|
||||
Ok(s) => Ok(s.to_string()),
|
||||
Err(e) => bail!("invalid utf8 in cstring: {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read membership::Configuration from Bytes.
|
||||
fn get_mconf(buf: &mut Bytes) -> Result<membership::Configuration> {
|
||||
let generation = buf.get_u32_f().with_context(|| "reading generation")?;
|
||||
let members_len = buf.get_u32_f().with_context(|| "reading members_len")?;
|
||||
// Main member set must have at least someone in valid configuration.
|
||||
// Empty conf is allowed until we fully migrate.
|
||||
if generation != INVALID_GENERATION && members_len == 0 {
|
||||
bail!("empty members_len");
|
||||
}
|
||||
let mut members = MemberSet::empty();
|
||||
for i in 0..members_len {
|
||||
let id = buf
|
||||
.get_u64_f()
|
||||
.with_context(|| format!("reading member {} node_id", i))?;
|
||||
let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?;
|
||||
let pg_port = buf
|
||||
.get_u16_f()
|
||||
.with_context(|| format!("reading member {} port", i))?;
|
||||
let sk = SafekeeperId {
|
||||
id: NodeId(id),
|
||||
host,
|
||||
pg_port,
|
||||
};
|
||||
members.add(sk)?;
|
||||
}
|
||||
let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?;
|
||||
// Non joint conf.
|
||||
if new_members_len == 0 {
|
||||
Ok(membership::Configuration {
|
||||
generation,
|
||||
members,
|
||||
new_members: None,
|
||||
})
|
||||
} else {
|
||||
let mut new_members = MemberSet::empty();
|
||||
for i in 0..new_members_len {
|
||||
let id = buf
|
||||
.get_u64_f()
|
||||
.with_context(|| format!("reading new member {} node_id", i))?;
|
||||
let host = Self::get_cstr(buf)
|
||||
.with_context(|| format!("reading new member {} host", i))?;
|
||||
let pg_port = buf
|
||||
.get_u16_f()
|
||||
.with_context(|| format!("reading new member {} port", i))?;
|
||||
let sk = SafekeeperId {
|
||||
id: NodeId(id),
|
||||
host,
|
||||
pg_port,
|
||||
};
|
||||
new_members.add(sk)?;
|
||||
}
|
||||
Ok(membership::Configuration {
|
||||
generation,
|
||||
members,
|
||||
new_members: Some(new_members),
|
||||
})
|
||||
}
|
||||
}
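Read back into Rust, the wire layout parsed above corresponds to building a configuration like the sketch below. It is illustrative only: it assumes the generation is the plain integer that get_mconf reads, the NodeId import path is a guess, and the hosts and ports are invented.

// Hypothetical helper; types are used as they appear elsewhere in this file.
use safekeeper_api::membership::{self, MemberSet, SafekeeperId};
use utils::id::NodeId; // assumed location of NodeId

fn example_two_node_conf(
    generation: membership::Generation,
) -> anyhow::Result<membership::Configuration> {
    let mut members = MemberSet::empty();
    members.add(SafekeeperId { id: NodeId(1), host: "sk-1".to_string(), pg_port: 5454 })?;
    members.add(SafekeeperId { id: NodeId(2), host: "sk-2".to_string(), pg_port: 5454 })?;
    // members_len > 0 and new_members_len == 0 on the wire means a non-joint configuration.
    Ok(membership::Configuration { generation, members, new_members: None })
}
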
|
||||
|
||||
/// Parse proposer message.
|
||||
pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
|
||||
if proto_version == SK_PROTO_VERSION_3 {
|
||||
if msg_bytes.is_empty() {
|
||||
bail!("ProposerAcceptorMessage is not complete: missing tag");
|
||||
pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
|
||||
if proto_version != SK_PROTOCOL_VERSION {
|
||||
bail!(
|
||||
"incompatible protocol version {}, expected {}",
|
||||
proto_version,
|
||||
SK_PROTOCOL_VERSION
|
||||
);
|
||||
}
|
||||
// xxx using Reader is inefficient but easy to work with bincode
|
||||
let mut stream = msg_bytes.reader();
|
||||
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
|
||||
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
|
||||
match tag {
|
||||
'g' => {
|
||||
let msg = ProposerGreeting::des_from(&mut stream)?;
|
||||
Ok(ProposerAcceptorMessage::Greeting(msg))
|
||||
}
|
||||
let tag = msg_bytes.get_u8_f().with_context(|| {
|
||||
"ProposerAcceptorMessage is not complete: missing tag".to_string()
|
||||
})? as char;
|
||||
match tag {
|
||||
'g' => {
|
||||
let tenant_id_str =
|
||||
Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?;
|
||||
let tenant_id = TenantId::from_str(&tenant_id_str)?;
|
||||
let timeline_id_str =
|
||||
Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?;
|
||||
let timeline_id = TimelineId::from_str(&timeline_id_str)?;
|
||||
let mconf = Self::get_mconf(&mut msg_bytes)?;
|
||||
let pg_version = msg_bytes
|
||||
.get_u32_f()
|
||||
.with_context(|| "reading pg_version")?;
|
||||
let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?;
|
||||
let wal_seg_size = msg_bytes
|
||||
.get_u32_f()
|
||||
.with_context(|| "reading wal_seg_size")?;
|
||||
let g = ProposerGreeting {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mconf,
|
||||
pg_version,
|
||||
system_id,
|
||||
wal_seg_size,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Greeting(g))
|
||||
}
|
||||
'v' => {
|
||||
let generation = msg_bytes
|
||||
.get_u32_f()
|
||||
.with_context(|| "reading generation")?;
|
||||
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
|
||||
let v = VoteRequest { generation, term };
|
||||
Ok(ProposerAcceptorMessage::VoteRequest(v))
|
||||
}
|
||||
'e' => {
|
||||
let generation = msg_bytes
|
||||
.get_u32_f()
|
||||
.with_context(|| "reading generation")?;
|
||||
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
|
||||
let start_streaming_at: Lsn = msg_bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| "reading start_streaming_at")?
|
||||
.into();
|
||||
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
|
||||
let msg = ProposerElected {
|
||||
generation,
|
||||
term,
|
||||
start_streaming_at,
|
||||
term_history,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Elected(msg))
|
||||
}
|
||||
'a' => {
|
||||
let generation = msg_bytes
|
||||
.get_u32_f()
|
||||
.with_context(|| "reading generation")?;
|
||||
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
|
||||
let begin_lsn: Lsn = msg_bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| "reading begin_lsn")?
|
||||
.into();
|
||||
let end_lsn: Lsn = msg_bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| "reading end_lsn")?
|
||||
.into();
|
||||
let commit_lsn: Lsn = msg_bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| "reading commit_lsn")?
|
||||
.into();
|
||||
let truncate_lsn: Lsn = msg_bytes
|
||||
.get_u64_f()
|
||||
.with_context(|| "reading truncate_lsn")?
|
||||
.into();
|
||||
let hdr = AppendRequestHeader {
|
||||
generation,
|
||||
term,
|
||||
begin_lsn,
|
||||
end_lsn,
|
||||
commit_lsn,
|
||||
truncate_lsn,
|
||||
};
|
||||
let rec_size = hdr
|
||||
.end_lsn
|
||||
.checked_sub(hdr.begin_lsn)
|
||||
.context("begin_lsn > end_lsn in AppendRequest")?
|
||||
.0 as usize;
|
||||
if rec_size > MAX_SEND_SIZE {
|
||||
bail!(
|
||||
"AppendRequest is longer than MAX_SEND_SIZE ({})",
|
||||
MAX_SEND_SIZE
|
||||
);
|
||||
}
|
||||
if msg_bytes.remaining() < rec_size {
|
||||
bail!(
|
||||
"reading WAL: only {} bytes left, wanted {}",
|
||||
msg_bytes.remaining(),
|
||||
rec_size
|
||||
);
|
||||
}
|
||||
let wal_data = msg_bytes.copy_to_bytes(rec_size);
|
||||
let msg = AppendRequest { h: hdr, wal_data };
|
||||
|
||||
Ok(ProposerAcceptorMessage::AppendRequest(msg))
|
||||
}
|
||||
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
|
||||
'v' => {
|
||||
let msg = VoteRequest::des_from(&mut stream)?;
|
||||
Ok(ProposerAcceptorMessage::VoteRequest(msg))
|
||||
}
|
||||
// TODO remove proto_version == 3 after converting all msgs
|
||||
} else if proto_version == SK_PROTO_VERSION_2 || proto_version == SK_PROTO_VERSION_3 {
|
||||
// xxx using Reader is inefficient but easy to work with bincode
|
||||
let mut stream = msg_bytes.reader();
|
||||
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
|
||||
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
|
||||
match tag {
|
||||
'g' => {
|
||||
let msgv2 = ProposerGreetingV2::des_from(&mut stream)?;
|
||||
let g = ProposerGreeting {
|
||||
tenant_id: msgv2.tenant_id,
|
||||
timeline_id: msgv2.timeline_id,
|
||||
mconf: membership::Configuration {
|
||||
generation: INVALID_GENERATION,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
},
|
||||
pg_version: msgv2.pg_version,
|
||||
system_id: msgv2.system_id,
|
||||
wal_seg_size: msgv2.wal_seg_size,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Greeting(g))
|
||||
'e' => {
|
||||
let mut msg_bytes = stream.into_inner();
|
||||
if msg_bytes.remaining() < 16 {
|
||||
bail!("ProposerElected message is not complete");
|
||||
}
|
||||
'v' => {
|
||||
let msg = VoteRequestV2::des_from(&mut stream)?;
|
||||
let v = VoteRequest {
|
||||
generation: INVALID_GENERATION,
|
||||
term: msg.term,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::VoteRequest(v))
|
||||
let term = msg_bytes.get_u64_le();
|
||||
let start_streaming_at = msg_bytes.get_u64_le().into();
|
||||
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
|
||||
if msg_bytes.remaining() < 8 {
|
||||
bail!("ProposerElected message is not complete");
|
||||
}
|
||||
'e' => {
|
||||
let mut msg_bytes = stream.into_inner();
|
||||
if msg_bytes.remaining() < 16 {
|
||||
bail!("ProposerElected message is not complete");
|
||||
}
|
||||
let term = msg_bytes.get_u64_le();
|
||||
let start_streaming_at = msg_bytes.get_u64_le().into();
|
||||
let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?;
|
||||
if msg_bytes.remaining() < 8 {
|
||||
bail!("ProposerElected message is not complete");
|
||||
}
|
||||
let _timeline_start_lsn = msg_bytes.get_u64_le();
|
||||
let msg = ProposerElected {
|
||||
generation: INVALID_GENERATION,
|
||||
term,
|
||||
start_streaming_at,
|
||||
term_history,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Elected(msg))
|
||||
}
|
||||
'a' => {
|
||||
// read header followed by wal data
|
||||
let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?;
|
||||
let hdr = AppendRequestHeader {
|
||||
generation: INVALID_GENERATION,
|
||||
term: hdrv2.term,
|
||||
begin_lsn: hdrv2.begin_lsn,
|
||||
end_lsn: hdrv2.end_lsn,
|
||||
commit_lsn: hdrv2.commit_lsn,
|
||||
truncate_lsn: hdrv2.truncate_lsn,
|
||||
};
|
||||
let rec_size = hdr
|
||||
.end_lsn
|
||||
.checked_sub(hdr.begin_lsn)
|
||||
.context("begin_lsn > end_lsn in AppendRequest")?
|
||||
.0 as usize;
|
||||
if rec_size > MAX_SEND_SIZE {
|
||||
bail!(
|
||||
"AppendRequest is longer than MAX_SEND_SIZE ({})",
|
||||
MAX_SEND_SIZE
|
||||
);
|
||||
}
|
||||
|
||||
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
|
||||
stream.read_exact(&mut wal_data_vec)?;
|
||||
let wal_data = Bytes::from(wal_data_vec);
|
||||
|
||||
let msg = AppendRequest { h: hdr, wal_data };
|
||||
|
||||
Ok(ProposerAcceptorMessage::AppendRequest(msg))
|
||||
}
|
||||
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
|
||||
let timeline_start_lsn = msg_bytes.get_u64_le().into();
|
||||
let msg = ProposerElected {
|
||||
term,
|
||||
start_streaming_at,
|
||||
timeline_start_lsn,
|
||||
term_history,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Elected(msg))
|
||||
}
|
||||
} else {
|
||||
bail!("unsupported protocol version {}", proto_version);
|
||||
'a' => {
|
||||
// read header followed by wal data
|
||||
let hdr = AppendRequestHeader::des_from(&mut stream)?;
|
||||
let rec_size = hdr
|
||||
.end_lsn
|
||||
.checked_sub(hdr.begin_lsn)
|
||||
.context("begin_lsn > end_lsn in AppendRequest")?
|
||||
.0 as usize;
|
||||
if rec_size > MAX_SEND_SIZE {
|
||||
bail!(
|
||||
"AppendRequest is longer than MAX_SEND_SIZE ({})",
|
||||
MAX_SEND_SIZE
|
||||
);
|
||||
}
|
||||
|
||||
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
|
||||
stream.read_exact(&mut wal_data_vec)?;
|
||||
let wal_data = Bytes::from(wal_data_vec);
|
||||
let msg = AppendRequest { h: hdr, wal_data };
|
||||
|
||||
Ok(ProposerAcceptorMessage::AppendRequest(msg))
|
||||
}
|
||||
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -706,21 +394,36 @@ impl ProposerAcceptorMessage {
|
||||
// We explicitly list all fields, to draw attention here when new fields are added.
|
||||
let mut size = BASE_SIZE;
|
||||
size += match self {
|
||||
Self::Greeting(_) => 0,
|
||||
Self::Greeting(ProposerGreeting {
|
||||
protocol_version: _,
|
||||
pg_version: _,
|
||||
proposer_id: _,
|
||||
system_id: _,
|
||||
timeline_id: _,
|
||||
tenant_id: _,
|
||||
tli: _,
|
||||
wal_seg_size: _,
|
||||
}) => 0,
|
||||
|
||||
Self::VoteRequest(_) => 0,
|
||||
Self::VoteRequest(VoteRequest { term: _ }) => 0,
|
||||
|
||||
Self::Elected(_) => 0,
|
||||
Self::Elected(ProposerElected {
|
||||
term: _,
|
||||
start_streaming_at: _,
|
||||
term_history: _,
|
||||
timeline_start_lsn: _,
|
||||
}) => 0,
|
||||
|
||||
Self::AppendRequest(AppendRequest {
|
||||
h:
|
||||
AppendRequestHeader {
|
||||
generation: _,
|
||||
term: _,
|
||||
term_start_lsn: _,
|
||||
begin_lsn: _,
|
||||
end_lsn: _,
|
||||
commit_lsn: _,
|
||||
truncate_lsn: _,
|
||||
proposer_uuid: _,
|
||||
},
|
||||
wal_data,
|
||||
}) => wal_data.len(),
|
||||
@@ -728,12 +431,13 @@ impl ProposerAcceptorMessage {
|
||||
Self::NoFlushAppendRequest(AppendRequest {
|
||||
h:
|
||||
AppendRequestHeader {
|
||||
generation: _,
|
||||
term: _,
|
||||
term_start_lsn: _,
|
||||
begin_lsn: _,
|
||||
end_lsn: _,
|
||||
commit_lsn: _,
|
||||
truncate_lsn: _,
|
||||
proposer_uuid: _,
|
||||
},
|
||||
wal_data,
|
||||
}) => wal_data.len(),
|
||||
@@ -754,118 +458,45 @@ pub enum AcceptorProposerMessage {
|
||||
}
|
||||
|
||||
impl AcceptorProposerMessage {
|
||||
fn put_cstr(buf: &mut BytesMut, s: &str) {
|
||||
buf.put_slice(s.as_bytes());
|
||||
buf.put_u8(0); // null terminator
|
||||
}
|
||||
|
||||
/// Serialize membership::Configuration into buf.
|
||||
fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) {
|
||||
buf.put_u32(mconf.generation);
|
||||
buf.put_u32(mconf.members.m.len() as u32);
|
||||
for sk in &mconf.members.m {
|
||||
buf.put_u64(sk.id.0);
|
||||
Self::put_cstr(buf, &sk.host);
|
||||
buf.put_u16(sk.pg_port);
|
||||
}
|
||||
if let Some(ref new_members) = mconf.new_members {
|
||||
buf.put_u32(new_members.m.len() as u32);
|
||||
for sk in &new_members.m {
|
||||
buf.put_u64(sk.id.0);
|
||||
Self::put_cstr(buf, &sk.host);
|
||||
buf.put_u16(sk.pg_port);
|
||||
}
|
||||
} else {
|
||||
buf.put_u32(0);
|
||||
}
|
||||
}
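As a quick worked example of the layout this routine produces: a non-joint configuration with generation 7 and a single member (node id 1, host "sk-1", pg port 5454) serializes, in network byte order, as u32 7, u32 1 (member count), u64 1, the four bytes of "sk-1" plus a NUL terminator, u16 5454, and a final u32 0 for the absent new member set, for 4 + 4 + 8 + 5 + 2 + 4 = 27 bytes in total.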
|
||||
|
||||
/// Serialize acceptor -> proposer message.
|
||||
pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> {
|
||||
if proto_version == SK_PROTO_VERSION_3 {
|
||||
match self {
|
||||
AcceptorProposerMessage::Greeting(msg) => {
|
||||
buf.put_u8('g' as u8);
|
||||
buf.put_u64(msg.node_id.0);
|
||||
Self::serialize_mconf(buf, &msg.mconf);
|
||||
buf.put_u64(msg.term)
|
||||
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
|
||||
match self {
|
||||
AcceptorProposerMessage::Greeting(msg) => {
|
||||
buf.put_u64_le('g' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.node_id.0);
|
||||
}
|
||||
AcceptorProposerMessage::VoteResponse(msg) => {
|
||||
buf.put_u64_le('v' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.vote_given);
|
||||
buf.put_u64_le(msg.flush_lsn.into());
|
||||
buf.put_u64_le(msg.truncate_lsn.into());
|
||||
buf.put_u32_le(msg.term_history.0.len() as u32);
|
||||
for e in &msg.term_history.0 {
|
||||
buf.put_u64_le(e.term);
|
||||
buf.put_u64_le(e.lsn.into());
|
||||
}
|
||||
AcceptorProposerMessage::VoteResponse(msg) => {
|
||||
buf.put_u8('v' as u8);
|
||||
buf.put_u32(msg.generation);
|
||||
buf.put_u64(msg.term);
|
||||
buf.put_u8(msg.vote_given as u8);
|
||||
buf.put_u64(msg.flush_lsn.into());
|
||||
buf.put_u64(msg.truncate_lsn.into());
|
||||
buf.put_u32(msg.term_history.0.len() as u32);
|
||||
for e in &msg.term_history.0 {
|
||||
buf.put_u64(e.term);
|
||||
buf.put_u64(e.lsn.into());
|
||||
}
|
||||
}
|
||||
AcceptorProposerMessage::AppendResponse(msg) => {
|
||||
buf.put_u8('a' as u8);
|
||||
buf.put_u32(msg.generation);
|
||||
buf.put_u64(msg.term);
|
||||
buf.put_u64(msg.flush_lsn.into());
|
||||
buf.put_u64(msg.commit_lsn.into());
|
||||
buf.put_i64(msg.hs_feedback.ts);
|
||||
buf.put_u64(msg.hs_feedback.xmin);
|
||||
buf.put_u64(msg.hs_feedback.catalog_xmin);
|
||||
buf.put_u64_le(msg.timeline_start_lsn.into());
|
||||
}
|
||||
AcceptorProposerMessage::AppendResponse(msg) => {
|
||||
buf.put_u64_le('a' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.flush_lsn.into());
|
||||
buf.put_u64_le(msg.commit_lsn.into());
|
||||
buf.put_i64_le(msg.hs_feedback.ts);
|
||||
buf.put_u64_le(msg.hs_feedback.xmin);
|
||||
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
|
||||
|
||||
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
|
||||
// if it is not present.
|
||||
if let Some(ref msg) = msg.pageserver_feedback {
|
||||
msg.serialize(buf);
|
||||
}
|
||||
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
|
||||
// if it is not present.
|
||||
if let Some(ref msg) = msg.pageserver_feedback {
|
||||
msg.serialize(buf);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
// TODO remove 3 after converting all msgs
|
||||
} else if proto_version == SK_PROTO_VERSION_2 {
|
||||
match self {
|
||||
AcceptorProposerMessage::Greeting(msg) => {
|
||||
buf.put_u64_le('g' as u64);
|
||||
// v2 didn't have mconf and fields were reordered
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.node_id.0);
|
||||
}
|
||||
AcceptorProposerMessage::VoteResponse(msg) => {
|
||||
// v2 didn't have generation, had u64 vote_given and timeline_start_lsn
|
||||
buf.put_u64_le('v' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.vote_given as u64);
|
||||
buf.put_u64_le(msg.flush_lsn.into());
|
||||
buf.put_u64_le(msg.truncate_lsn.into());
|
||||
buf.put_u32_le(msg.term_history.0.len() as u32);
|
||||
for e in &msg.term_history.0 {
|
||||
buf.put_u64_le(e.term);
|
||||
buf.put_u64_le(e.lsn.into());
|
||||
}
|
||||
// removed timeline_start_lsn
|
||||
buf.put_u64_le(0);
|
||||
}
|
||||
AcceptorProposerMessage::AppendResponse(msg) => {
|
||||
// v2 didn't have generation
|
||||
buf.put_u64_le('a' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.flush_lsn.into());
|
||||
buf.put_u64_le(msg.commit_lsn.into());
|
||||
buf.put_i64_le(msg.hs_feedback.ts);
|
||||
buf.put_u64_le(msg.hs_feedback.xmin);
|
||||
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
|
||||
|
||||
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
|
||||
// if it is not present.
|
||||
if let Some(ref msg) = msg.pageserver_feedback {
|
||||
msg.serialize(buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("unsupported protocol version {}", proto_version);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -962,6 +593,14 @@ where
|
||||
&mut self,
|
||||
msg: &ProposerGreeting,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
// Check protocol compatibility
|
||||
if msg.protocol_version != SK_PROTOCOL_VERSION {
|
||||
bail!(
|
||||
"incompatible protocol version {}, expected {}",
|
||||
msg.protocol_version,
|
||||
SK_PROTOCOL_VERSION
|
||||
);
|
||||
}
|
||||
/* Postgres major version mismatch is treated as fatal error
|
||||
* because safekeepers parse WAL headers and the format
|
||||
* may change between versions.
|
||||
@@ -1016,16 +655,15 @@ where
|
||||
self.state.finish_change(&state).await?;
|
||||
}
|
||||
|
||||
let apg = AcceptorGreeting {
|
||||
node_id: self.node_id,
|
||||
mconf: self.state.mconf.clone(),
|
||||
term: self.state.acceptor_state.term,
|
||||
};
|
||||
info!(
|
||||
"processed greeting {:?} from walproposer, sending {:?}",
|
||||
msg, apg
|
||||
"processed greeting from walproposer {}, sending term {:?}",
|
||||
msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
|
||||
self.state.acceptor_state.term
|
||||
);
|
||||
Ok(Some(AcceptorProposerMessage::Greeting(apg)))
|
||||
Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
|
||||
term: self.state.acceptor_state.term,
|
||||
node_id: self.node_id,
|
||||
})))
|
||||
}
|
||||
|
||||
/// Give vote for the given term, if we haven't done that previously.
|
||||
@@ -1046,12 +684,12 @@ where
|
||||
self.wal_store.flush_wal().await?;
|
||||
// initialize with refusal
|
||||
let mut resp = VoteResponse {
|
||||
generation: self.state.mconf.generation,
|
||||
term: self.state.acceptor_state.term,
|
||||
vote_given: false,
|
||||
vote_given: false as u64,
|
||||
flush_lsn: self.flush_lsn(),
|
||||
truncate_lsn: self.state.inmem.peer_horizon_lsn,
|
||||
term_history: self.get_term_history(),
|
||||
timeline_start_lsn: self.state.timeline_start_lsn,
|
||||
};
|
||||
if self.state.acceptor_state.term < msg.term {
|
||||
let mut state = self.state.start_change();
|
||||
@@ -1060,16 +698,15 @@ where
|
||||
self.state.finish_change(&state).await?;
|
||||
|
||||
resp.term = self.state.acceptor_state.term;
|
||||
resp.vote_given = true;
|
||||
resp.vote_given = true as u64;
|
||||
}
|
||||
info!("processed {:?}: sending {:?}", msg, &resp);
|
||||
info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
|
||||
Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
|
||||
}
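The voting rule above is the usual Paxos-style one: the safekeeper starts from a refusal carrying its current term, and only if the requested term is strictly higher does it persist the bumped term and flip the response to a grant. A compressed restatement, illustrative only and ignoring the persistence machinery and the v2 u64 vote flag:

// Grant a vote only for a term strictly higher than any term voted for before.
fn decide_vote(current_term: &mut u64, requested_term: u64) -> bool {
    if requested_term > *current_term {
        *current_term = requested_term; // must be durably persisted before replying
        true
    } else {
        false
    }
}
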
|
||||
|
||||
/// Form AppendResponse from current state.
|
||||
fn append_response(&self) -> AppendResponse {
|
||||
let ar = AppendResponse {
|
||||
generation: self.state.mconf.generation,
|
||||
term: self.state.acceptor_state.term,
|
||||
flush_lsn: self.flush_lsn(),
|
||||
commit_lsn: self.state.commit_lsn,
|
||||
@@ -1168,22 +805,18 @@ where
|
||||
// Here we learn initial LSN for the first time, set fields
|
||||
// interested in that.
|
||||
|
||||
if let Some(start_lsn) = msg.term_history.0.first() {
    if state.timeline_start_lsn == Lsn(0) {
        // Remember point where WAL begins globally. In the future it
        // will be initialized immediately on timeline creation.
        state.timeline_start_lsn = start_lsn.lsn;
        info!(
            "setting timeline_start_lsn to {:?}",
            state.timeline_start_lsn
        );
    }
if state.timeline_start_lsn == Lsn(0) {
|
||||
// Remember point where WAL begins globally.
|
||||
state.timeline_start_lsn = msg.timeline_start_lsn;
|
||||
info!(
|
||||
"setting timeline_start_lsn to {:?}",
|
||||
state.timeline_start_lsn
|
||||
);
|
||||
}
|
||||
|
||||
if state.peer_horizon_lsn == Lsn(0) {
|
||||
// Update peer_horizon_lsn as soon as we know where timeline starts.
|
||||
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
|
||||
state.peer_horizon_lsn = state.timeline_start_lsn;
|
||||
state.peer_horizon_lsn = msg.timeline_start_lsn;
|
||||
}
|
||||
if state.local_start_lsn == Lsn(0) {
|
||||
state.local_start_lsn = msg.start_streaming_at;
|
||||
@@ -1263,10 +896,7 @@ where
|
||||
|
||||
// If our term is higher, immediately refuse the message.
|
||||
if self.state.acceptor_state.term > msg.h.term {
|
||||
let resp = AppendResponse::term_only(
|
||||
self.state.mconf.generation,
|
||||
self.state.acceptor_state.term,
|
||||
);
|
||||
let resp = AppendResponse::term_only(self.state.acceptor_state.term);
|
||||
return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
|
||||
}
|
||||
|
||||
@@ -1294,8 +924,10 @@ where
|
||||
);
|
||||
}
|
||||
|
||||
// Now we know that we are in the same term as the proposer, process the
|
||||
// message.
|
||||
// Now we know that we are in the same term as the proposer,
|
||||
// processing the message.
|
||||
|
||||
self.state.inmem.proposer_uuid = msg.h.proposer_uuid;
|
||||
|
||||
// do the job
|
||||
if !msg.wal_data.is_empty() {
|
||||
|
||||
@@ -1,100 +1,330 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::future::Either;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend};
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use postgres_ffi::waldecoder::WalDecodeError;
|
||||
use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder};
|
||||
use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::sync::mpsc::error::SendError;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::MissedTickBehavior;
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::Compression;
|
||||
use utils::postgres_client::InterpretedFormat;
|
||||
use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords};
|
||||
use wal_decoder::wire_format::ToWireFormat;
|
||||
|
||||
use crate::send_wal::EndWatchView;
|
||||
use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder};
|
||||
use crate::metrics::WAL_READERS;
|
||||
use crate::send_wal::{EndWatchView, WalSenderGuard};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::{StreamingWalReader, WalBytes};
|
||||
|
||||
/// Shard-aware interpreted record sender.
|
||||
/// This is used for sending WAL to the pageserver. Said WAL
|
||||
/// is pre-interpreted and filtered for the shard.
|
||||
pub(crate) struct InterpretedWalSender<'a, IO> {
|
||||
pub(crate) format: InterpretedFormat,
|
||||
pub(crate) compression: Option<Compression>,
|
||||
pub(crate) pgb: &'a mut PostgresBackend<IO>,
|
||||
pub(crate) wal_stream_builder: WalReaderStreamBuilder,
|
||||
pub(crate) end_watch_view: EndWatchView,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) pg_version: u32,
|
||||
pub(crate) appname: Option<String>,
|
||||
/// Identifier used to differentiate between senders of the same
|
||||
/// shard.
|
||||
///
|
||||
/// In the steady state there's only one, but two pageservers may
|
||||
/// temporarily have the same shard attached and attempt to ingest
|
||||
/// WAL for it. See also [`ShardSenderId`].
|
||||
#[derive(Hash, Eq, PartialEq, Copy, Clone)]
|
||||
struct SenderId(u8);
|
||||
|
||||
impl SenderId {
|
||||
fn first() -> Self {
|
||||
SenderId(0)
|
||||
}
|
||||
|
||||
fn next(&self) -> Self {
|
||||
SenderId(self.0.checked_add(1).expect("few senders"))
|
||||
}
|
||||
}
|
||||
|
||||
struct Batch {
|
||||
#[derive(Hash, Eq, PartialEq)]
|
||||
struct ShardSenderId {
|
||||
shard: ShardIdentity,
|
||||
sender_id: SenderId,
|
||||
}
|
||||
|
||||
impl Display for ShardSenderId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug())
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardSenderId {
|
||||
fn new(shard: ShardIdentity, sender_id: SenderId) -> Self {
|
||||
ShardSenderId { shard, sender_id }
|
||||
}
|
||||
|
||||
fn shard(&self) -> ShardIdentity {
|
||||
self.shard
|
||||
}
|
||||
}
|
||||
|
||||
/// Shard-aware fan-out interpreted record reader.
/// Reads WAL from disk, decodes it, interprets it, and sends
/// it to any [`InterpretedWalSender`] connected to it.
/// Each [`InterpretedWalSender`] corresponds to one shard
/// and gets interpreted records concerning that shard only.
pub(crate) struct InterpretedWalReader {
|
||||
wal_stream: StreamingWalReader,
|
||||
shard_senders: HashMap<ShardIdentity, smallvec::SmallVec<[ShardSenderState; 1]>>,
|
||||
shard_notification_rx: Option<tokio::sync::mpsc::UnboundedReceiver<AttachShardNotification>>,
|
||||
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
/// A handle for [`InterpretedWalReader`] which allows for interacting with it
|
||||
/// when it runs as a separate tokio task.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct InterpretedWalReaderHandle {
|
||||
join_handle: JoinHandle<Result<(), InterpretedWalReaderError>>,
|
||||
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
|
||||
shard_notification_tx: tokio::sync::mpsc::UnboundedSender<AttachShardNotification>,
|
||||
}
|
||||
|
||||
struct ShardSenderState {
|
||||
sender_id: SenderId,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
next_record_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// State of [`InterpretedWalReader`] visible outside of the task running it.
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum InterpretedWalReaderState {
|
||||
Running { current_position: Lsn },
|
||||
Done,
|
||||
}
|
||||
|
||||
pub(crate) struct Batch {
|
||||
wal_end_lsn: Lsn,
|
||||
available_wal_end_lsn: Lsn,
|
||||
records: InterpretedWalRecords,
|
||||
}
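// Illustrative usage sketch (assumed variable names, not part of the diff): a Batch
// travels from the reader to each sender over a bounded mpsc channel, one channel
// per shard sender, and the sender serializes the records for the wire.
//
//     let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
//     // reader side: tx.send(Batch { wal_end_lsn, available_wal_end_lsn, records }).await
//     // sender side:
//     while let Some(batch) = rx.recv().await {
//         // serialize batch.records with `to_wire` and push it down the COPY stream
//     }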

impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
/// Send interpreted WAL to a receiver.
/// Stops when an error occurs or the receiver is caught up and there's no active compute.
///
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
let mut wal_position = self.wal_stream_builder.start_pos();
let mut wal_decoder =
WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
#[derive(thiserror::Error, Debug)]
pub enum InterpretedWalReaderError {
/// Handler initiates the end of streaming.
#[error("decode error: {0}")]
Decode(#[from] WalDecodeError),
#[error("read or interpret error: {0}")]
ReadOrInterpret(#[from] anyhow::Error),
#[error("wal stream closed")]
WalStreamClosed,
}

let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?;
let mut stream = std::pin::pin!(stream);
impl InterpretedWalReaderState {
fn current_position(&self) -> Option<Lsn> {
match self {
InterpretedWalReaderState::Running {
current_position, ..
} => Some(*current_position),
InterpretedWalReaderState::Done => None,
}
}
}

let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
keepalive_ticker.reset();
pub(crate) struct AttachShardNotification {
shard_id: ShardIdentity,
sender: tokio::sync::mpsc::Sender<Batch>,
start_pos: Lsn,
}

let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
let shard = vec![self.shard];
impl InterpretedWalReader {
/// Spawn the reader in a separate tokio task and return a handle
pub(crate) fn spawn(
wal_stream: StreamingWalReader,
start_pos: Lsn,
tx: tokio::sync::mpsc::Sender<Batch>,
shard: ShardIdentity,
pg_version: u32,
appname: &Option<String>,
) -> InterpretedWalReaderHandle {
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
current_position: start_pos,
}));

let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel();

let reader = InterpretedWalReader {
wal_stream,
shard_senders: HashMap::from([(
shard,
smallvec::smallvec![ShardSenderState {
sender_id: SenderId::first(),
tx,
next_record_lsn: start_pos,
}],
)]),
shard_notification_rx: Some(shard_notification_rx),
state: state.clone(),
pg_version,
};

let metric = WAL_READERS
.get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")])
.unwrap();

let join_handle = tokio::task::spawn(
async move {
metric.inc();
scopeguard::defer! {
metric.dec();
}

let res = reader.run_impl(start_pos).await;
if let Err(ref err) = res {
tracing::error!("Task finished with error: {err}");
}
res
}
.instrument(info_span!("interpreted wal reader")),
);

InterpretedWalReaderHandle {
join_handle,
state,
shard_notification_tx,
}
}

/// Construct the reader without spawning anything
/// Callers should drive the future returned by [`Self::run`].
pub(crate) fn new(
wal_stream: StreamingWalReader,
start_pos: Lsn,
tx: tokio::sync::mpsc::Sender<Batch>,
shard: ShardIdentity,
pg_version: u32,
) -> InterpretedWalReader {
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
current_position: start_pos,
}));

InterpretedWalReader {
wal_stream,
shard_senders: HashMap::from([(
shard,
smallvec::smallvec![ShardSenderState {
sender_id: SenderId::first(),
tx,
next_record_lsn: start_pos,
}],
)]),
shard_notification_rx: None,
state: state.clone(),
pg_version,
}
}

/// Entry point for future (polling) based wal reader.
pub(crate) async fn run(
self,
start_pos: Lsn,
appname: &Option<String>,
) -> Result<(), CopyStreamHandlerEnd> {
let metric = WAL_READERS
.get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")])
.unwrap();

metric.inc();
scopeguard::defer! {
metric.dec();
}

let res = self.run_impl(start_pos).await;
if let Err(err) = res {
tracing::error!("Interpreted wal reader encountered error: {err}");
} else {
tracing::info!("Interpreted wal reader exiting");
}

Err(CopyStreamHandlerEnd::Other(anyhow!(
"interpreted wal reader finished"
)))
}

/// Send interpreted WAL to one or more [`InterpretedWalSender`]s
/// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`]
/// goes out of scope.
async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> {
let defer_state = self.state.clone();
scopeguard::defer! {
*defer_state.write().unwrap() = InterpretedWalReaderState::Done;
}

let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);

loop {
tokio::select! {
// Get some WAL from the stream and then: decode, interpret and push it down the
// pipeline.
wal = stream.next(), if tx.capacity() > 0 => {
let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal {
Some(some) => some?,
None => { break; }
// Main branch for reading WAL and forwarding it
wal_or_reset = self.wal_stream.next() => {
let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below"));
let WalBytes {
wal,
wal_start_lsn: _,
wal_end_lsn,
available_wal_end_lsn,
} = match wal {
Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?,
None => {
// [`StreamingWalReader::next`] is an endless stream of WAL.
// It shouldn't ever finish unless it panicked or became internally
// inconsistent.
return Result::Err(InterpretedWalReaderError::WalStreamClosed);
}
};

wal_position = wal_end_lsn;
wal_decoder.feed_bytes(&wal);

let mut records = Vec::new();
// Deserialize and interpret WAL records from this batch of WAL.
// Interpreted records for each shard are collected separately.
let shard_ids = self.shard_senders.keys().copied().collect::<Vec<_>>();
let mut records_by_sender: HashMap<ShardSenderId, Vec<InterpretedWalRecord>> = HashMap::new();
let mut max_next_record_lsn = None;
while let Some((next_record_lsn, recdata)) = wal_decoder
.poll_decode()
.with_context(|| "Failed to decode WAL")?
while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()?
{
assert!(next_record_lsn.is_aligned());
max_next_record_lsn = Some(next_record_lsn);


// Deserialize and interpret WAL record
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
&shard,
&shard_ids,
next_record_lsn,
self.pg_version,
)
.with_context(|| "Failed to interpret WAL")?
.remove(&self.shard)
.unwrap();
.with_context(|| "Failed to interpret WAL")?;

if !interpreted.is_empty() {
records.push(interpreted);
for (shard, record) in interpreted {
if record.is_empty() {
continue;
}

let mut states_iter = self.shard_senders
.get(&shard)
.expect("keys collected above")
.iter()
.filter(|state| record.next_record_lsn > state.next_record_lsn)
.peekable();
while let Some(state) = states_iter.next() {
let shard_sender_id = ShardSenderId::new(shard, state.sender_id);

// The most common case is one sender per shard. Peek and break to avoid the
// clone in that situation.
if states_iter.peek().is_none() {
records_by_sender.entry(shard_sender_id).or_default().push(record);
break;
} else {
records_by_sender.entry(shard_sender_id).or_default().push(record.clone());
}
}
}
}

@@ -103,20 +333,170 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
None => { continue; }
};

let batch = InterpretedWalRecords {
records,
next_record_lsn: Some(max_next_record_lsn),
};
// Update the current position such that new receivers can decide
// whether to attach to us or spawn a new WAL reader.
match &mut *self.state.write().unwrap() {
InterpretedWalReaderState::Running { current_position, .. } => {
*current_position = max_next_record_lsn;
},
InterpretedWalReaderState::Done => {
unreachable!()
}
}

tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap();
// Send interpreted records downstream. Anything that has already been seen
// by a shard is filtered out.
let mut shard_senders_to_remove = Vec::new();
for (shard, states) in &mut self.shard_senders {
for state in states {
if max_next_record_lsn <= state.next_record_lsn {
continue;
}

let shard_sender_id = ShardSenderId::new(*shard, state.sender_id);
let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default();

let batch = InterpretedWalRecords {
records,
next_record_lsn: Some(max_next_record_lsn),
};

let res = state.tx.send(Batch {
wal_end_lsn,
available_wal_end_lsn,
records: batch,
}).await;

if res.is_err() {
shard_senders_to_remove.push(shard_sender_id);
} else {
state.next_record_lsn = max_next_record_lsn;
}
}
}

// Clean up any shard senders that have dropped out.
// This is inefficient, but such events are rare (pageserver connection termination)
// and the number of subscriptions on the same shard is very small (only one
// in the steady state).
for to_remove in shard_senders_to_remove {
let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above");
if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) {
shard_senders.remove(idx);
tracing::info!("Removed shard sender {}", to_remove);
}

if shard_senders.is_empty() {
self.shard_senders.remove(&to_remove.shard());
}
}
},
// For a previously interpreted batch, serialize it and push it down the wire.
batch = rx.recv() => {
// Listen for new shards that want to attach to this reader.
// If the reader is not running as a task, then this is not supported
// (see the pending branch below).
notification = match self.shard_notification_rx.as_mut() {
Some(rx) => Either::Left(rx.recv()),
None => Either::Right(std::future::pending())
} => {
if let Some(n) = notification {
let AttachShardNotification { shard_id, sender, start_pos } = n;

// Update internal and external state, then reset the WAL stream
// if required.
let senders = self.shard_senders.entry(shard_id).or_default();
let new_sender_id = match senders.last() {
Some(sender) => sender.sender_id.next(),
None => SenderId::first()
};

senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
let current_pos = self.state.read().unwrap().current_position().unwrap();
if start_pos < current_pos {
self.wal_stream.reset(start_pos).await;
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
}

tracing::info!(
"Added shard sender {} with start_pos={} current_pos={}",
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
);
}
}
}
}
}
}

impl InterpretedWalReaderHandle {
/// Fan-out the reader by attaching a new shard to it
pub(crate) fn fanout(
&self,
shard_id: ShardIdentity,
sender: tokio::sync::mpsc::Sender<Batch>,
start_pos: Lsn,
) -> Result<(), SendError<AttachShardNotification>> {
self.shard_notification_tx.send(AttachShardNotification {
shard_id,
sender,
start_pos,
})
}

/// Get the current WAL position of the reader
pub(crate) fn current_position(&self) -> Option<Lsn> {
self.state.read().unwrap().current_position()
}

pub(crate) fn abort(&self) {
self.join_handle.abort()
}
}

impl Drop for InterpretedWalReaderHandle {
fn drop(&mut self) {
tracing::info!("Aborting interpreted wal reader");
self.abort()
}
}
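// Illustrative usage sketch (assumed variable names, not part of the diff): fanning a
// second shard out onto an already-running reader via its handle, mirroring the
// fanout test further down.
//
//     let handle = InterpretedWalReader::spawn(
//         wal_stream, start_pos, shard_0_tx, shard_0, pg_version, &appname,
//     );
//     let (shard_1_tx, shard_1_rx) = tokio::sync::mpsc::channel::<Batch>(2);
//     handle.fanout(shard_1, shard_1_tx, start_pos)?;
//     // current_position() reports progress; abort() stops the reader task.
//     handle.abort();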

pub(crate) struct InterpretedWalSender<'a, IO> {
pub(crate) format: InterpretedFormat,
pub(crate) compression: Option<Compression>,
pub(crate) appname: Option<String>,

pub(crate) tli: WalResidentTimeline,
pub(crate) start_lsn: Lsn,

pub(crate) pgb: &'a mut PostgresBackend<IO>,
pub(crate) end_watch_view: EndWatchView,
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
pub(crate) rx: tokio::sync::mpsc::Receiver<Batch>,
}

impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
/// Send interpreted WAL records over the network.
/// Also manages keep-alives if nothing was sent for a while.
pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
keepalive_ticker.reset();

let mut wal_position = self.start_lsn;

loop {
tokio::select! {
batch = self.rx.recv() => {
let batch = match batch {
Some(b) => b,
None => { break; }
None => {
return Result::Err(
CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early"))
);
}
};

wal_position = batch.wal_end_lsn;

let buf = batch
.records
.to_wire(self.format, self.compression)
@@ -136,7 +516,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
})).await?;
}
// Send a periodic keep alive when the connection has been idle for a while.
// Since we've been idle, also check if we can stop streaming.
_ = keepalive_ticker.tick() => {
if let Some(remote_consistent_lsn) = self.wal_sender_guard
.walsenders()
.get_ws_remote_consistent_lsn(self.wal_sender_guard.id())
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Stop streaming if the receivers are caught up and
// there's no active compute. This causes the loop in
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
// to exit and terminate the WAL stream.
break;
}
}

self.pgb
.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
wal_end: self.end_watch_view.get().0,
@@ -144,14 +538,259 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
request_reply: true,
}))
.await?;
}
},
}
}

// The loop above ends when the receiver is caught up and there's no more WAL to send.
Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caught up and there are no computes",
self.appname, wal_position,
)))
}
}
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
send_interpreted_wal::{Batch, InterpretedWalReader},
|
||||
test_utils::Env,
|
||||
wal_reader_stream::StreamingWalReader,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_interpreted_wal_reader_fanout() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
tracing::info!("Doing first round of reads ...");
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard_0 = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let shard_1 = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut shards = HashMap::new();
|
||||
|
||||
for shard_number in 0..SHARD_COUNT {
|
||||
let shard_id = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
shards.insert(shard_id, (Some(tx), Some(rx)));
|
||||
}
|
||||
|
||||
let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap();
|
||||
let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap();
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
shard_0_tx,
|
||||
shard_0,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
tracing::info!("Reading all WAL with only shard 0 attached ...");
|
||||
|
||||
let mut shard_0_interpreted_records = Vec::new();
|
||||
while let Some(batch) = shard_0_rx.recv().await {
|
||||
shard_0_interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap();
|
||||
let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap();
|
||||
|
||||
tracing::info!("Attaching shard 1 to the reader at start of WAL");
|
||||
handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap();
|
||||
|
||||
tracing::info!("Reading all WAL with shard 0 and shard 1 attached ...");
|
||||
|
||||
let mut shard_1_interpreted_records = Vec::new();
|
||||
while let Some(batch) = shard_1_rx.recv().await {
|
||||
shard_1_interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// This test uses logical messages. Those only go to shard 0. Check that the
|
||||
// filtering worked and shard 1 did not get any.
|
||||
assert!(shard_1_interpreted_records
|
||||
.iter()
|
||||
.all(|recs| recs.records.is_empty()));
|
||||
|
||||
// Shard 0 should not receive anything more since the reader is
|
||||
// going through wal that it has already processed.
|
||||
let res = shard_0_rx.try_recv();
|
||||
if let Ok(ref ok) = res {
|
||||
tracing::error!(
|
||||
"Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}",
|
||||
ok.wal_end_lsn,
|
||||
ok.available_wal_end_lsn
|
||||
);
|
||||
}
|
||||
assert!(matches!(res, Err(TryRecvError::Empty)));
|
||||
|
||||
// Check that the next records lsns received by the two shards match up.
|
||||
let shard_0_next_lsns = shard_0_interpreted_records
|
||||
.iter()
|
||||
.map(|recs| recs.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
let shard_1_next_lsns = shard_1_interpreted_records
|
||||
.iter()
|
||||
.map(|recs| recs.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(shard_0_next_lsns, shard_1_next_lsns);
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_interpreted_wal_reader_same_shard_fanout() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
const ATTACHED_SHARDS: u8 = 4;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard_0 = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
let mut batch_receivers = vec![rx];
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
tx,
|
||||
shard_0,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
for _ in 0..(ATTACHED_SHARDS - 1) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
handle.fanout(shard_0, tx, start_lsn).unwrap();
|
||||
batch_receivers.push(rx);
|
||||
}
|
||||
|
||||
loop {
|
||||
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
|
||||
for rx in batch_receivers.iter_mut().skip(1) {
|
||||
let other_batch = rx.recv().await.unwrap();
|
||||
|
||||
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
|
||||
assert_eq!(
|
||||
batch.available_wal_end_lsn,
|
||||
other_batch.available_wal_end_lsn
|
||||
);
|
||||
}
|
||||
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,16 +2,18 @@
|
||||
//! with the "START_REPLICATION" message, and registry of walsenders.
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::RECEIVED_PS_FEEDBACKS;
|
||||
use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS};
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::TermLsn;
|
||||
use crate::send_interpreted_wal::InterpretedWalSender;
|
||||
use crate::send_interpreted_wal::{
|
||||
Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender,
|
||||
};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::WalReaderStreamBuilder;
|
||||
use crate::wal_reader_stream::StreamingWalReader;
|
||||
use crate::wal_storage::WalReader;
|
||||
use anyhow::{bail, Context as AnyhowContext};
|
||||
use bytes::Bytes;
|
||||
use futures::future::Either;
|
||||
use futures::FutureExt;
|
||||
use parking_lot::Mutex;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
|
||||
@@ -19,16 +21,16 @@ use postgres_ffi::get_current_timestamp;
|
||||
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
|
||||
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
|
||||
use safekeeper_api::models::{
|
||||
ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
|
||||
WalSenderState, INVALID_FULL_TRANSACTION_ID,
|
||||
HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
|
||||
INVALID_FULL_TRANSACTION_ID,
|
||||
};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use itertools::Itertools;
|
||||
use std::cmp::{max, min};
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
@@ -50,6 +52,12 @@ pub struct WalSenders {
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
}
|
||||
|
||||
pub struct WalSendersTimelineMetricValues {
|
||||
pub ps_feedback_counter: u64,
|
||||
pub last_ps_feedback: PageserverFeedback,
|
||||
pub interpreted_wal_reader_tasks: usize,
|
||||
}
|
||||
|
||||
impl WalSenders {
|
||||
pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
|
||||
Arc::new(WalSenders {
|
||||
@@ -60,21 +68,8 @@ impl WalSenders {
|
||||
|
||||
/// Register new walsender. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
fn register(
|
||||
self: &Arc<WalSenders>,
|
||||
ttid: TenantTimelineId,
|
||||
addr: SocketAddr,
|
||||
conn_id: ConnectionId,
|
||||
appname: Option<String>,
|
||||
) -> WalSenderGuard {
|
||||
fn register(self: &Arc<WalSenders>, walsender_state: WalSenderState) -> WalSenderGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walsender_state = WalSenderState {
|
||||
ttid,
|
||||
addr,
|
||||
conn_id,
|
||||
appname,
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
};
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walsender_state);
|
||||
@@ -90,9 +85,79 @@ impl WalSenders {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_or_update_interpreted_reader<
|
||||
FUp: FnOnce(&Arc<InterpretedWalReaderHandle>) -> anyhow::Result<()>,
|
||||
FNew: FnOnce() -> InterpretedWalReaderHandle,
|
||||
>(
|
||||
self: &Arc<WalSenders>,
|
||||
id: WalSenderId,
|
||||
start_pos: Lsn,
|
||||
max_delta_for_fanout: Option<u64>,
|
||||
update: FUp,
|
||||
create: FNew,
|
||||
) -> anyhow::Result<()> {
|
||||
let state = &mut self.mutex.lock();
|
||||
|
||||
let mut selected_interpreted_reader = None;
|
||||
for slot in state.slots.iter().flatten() {
|
||||
if let WalSenderState::Interpreted(slot_state) = slot {
|
||||
if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader {
|
||||
let select = match (interpreted_reader.current_position(), max_delta_for_fanout)
|
||||
{
|
||||
(Some(pos), Some(max_delta)) => {
|
||||
let delta = pos.0.abs_diff(start_pos.0);
|
||||
delta <= max_delta
|
||||
}
|
||||
// Reader is not active
|
||||
(None, _) => false,
|
||||
// Gating fanout by max delta is disabled.
|
||||
// Attach to any active reader.
|
||||
(_, None) => true,
|
||||
};
|
||||
|
||||
if select {
|
||||
selected_interpreted_reader = Some(interpreted_reader.clone());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let slot = state.get_slot_mut(id);
|
||||
let slot_state = match slot {
|
||||
WalSenderState::Interpreted(s) => s,
|
||||
WalSenderState::Vanilla(_) => unreachable!(),
|
||||
};
|
||||
|
||||
let selected_or_new = match selected_interpreted_reader {
|
||||
Some(selected) => {
|
||||
update(&selected)?;
|
||||
selected
|
||||
}
|
||||
None => Arc::new(create()),
|
||||
};
|
||||
|
||||
slot_state.interpreted_wal_reader = Some(selected_or_new);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get state of all walsenders.
|
||||
pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
pub fn get_all_public(self: &Arc<WalSenders>) -> Vec<safekeeper_api::models::WalSenderState> {
|
||||
self.mutex
|
||||
.lock()
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|state| match state {
|
||||
WalSenderState::Vanilla(s) => {
|
||||
safekeeper_api::models::WalSenderState::Vanilla(s.clone())
|
||||
}
|
||||
WalSenderState::Interpreted(s) => {
|
||||
safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone())
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get LSN of the most lagging pageserver receiver. Return None if there are no
|
||||
@@ -103,7 +168,7 @@ impl WalSenders {
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.filter_map(|s| match s.feedback {
|
||||
.filter_map(|s| match s.get_feedback() {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
|
||||
ReplicationFeedback::Standby(_) => None,
|
||||
})
|
||||
@@ -111,9 +176,25 @@ impl WalSenders {
|
||||
}
|
||||
|
||||
/// Returns total counter of pageserver feedbacks received and last feedback.
|
||||
pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
|
||||
pub fn info_for_metrics(self: &Arc<WalSenders>) -> WalSendersTimelineMetricValues {
|
||||
let shared = self.mutex.lock();
|
||||
(shared.ps_feedback_counter, shared.last_ps_feedback)
|
||||
|
||||
let interpreted_wal_reader_tasks = shared
|
||||
.slots
|
||||
.iter()
|
||||
.filter_map(|ss| match ss {
|
||||
Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(),
|
||||
Some(WalSenderState::Vanilla(_)) => None,
|
||||
None => None,
|
||||
})
|
||||
.unique_by(|reader| Arc::as_ptr(reader))
|
||||
.count();
|
||||
|
||||
WalSendersTimelineMetricValues {
|
||||
ps_feedback_counter: shared.ps_feedback_counter,
|
||||
last_ps_feedback: shared.last_ps_feedback,
|
||||
interpreted_wal_reader_tasks,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get aggregated hot standby feedback (we send it to compute).
|
||||
@@ -124,7 +205,7 @@ impl WalSenders {
|
||||
/// Record new pageserver feedback, update aggregated values.
|
||||
fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
|
||||
*shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
|
||||
shared.last_ps_feedback = *feedback;
|
||||
shared.ps_feedback_counter += 1;
|
||||
drop(shared);
|
||||
@@ -143,10 +224,10 @@ impl WalSenders {
|
||||
"Record standby reply: ts={} apply_lsn={}",
|
||||
reply.reply_ts, reply.apply_lsn
|
||||
);
|
||||
match &mut slot.feedback {
|
||||
match &mut slot.get_mut_feedback() {
|
||||
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: *reply,
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
})
|
||||
@@ -158,10 +239,10 @@ impl WalSenders {
|
||||
fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slot = shared.get_slot_mut(id);
|
||||
match &mut slot.feedback {
|
||||
match &mut slot.get_mut_feedback() {
|
||||
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: StandbyReply::empty(),
|
||||
hs_feedback: *feedback,
|
||||
})
|
||||
@@ -175,7 +256,7 @@ impl WalSenders {
|
||||
pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
|
||||
let shared = self.mutex.lock();
|
||||
let slot = shared.get_slot(id);
|
||||
match slot.feedback {
|
||||
match slot.get_feedback() {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
|
||||
_ => None,
|
||||
}
|
||||
@@ -199,6 +280,47 @@ struct WalSendersShared {
|
||||
slots: Vec<Option<WalSenderState>>,
|
||||
}
|
||||
|
||||
/// Safekeeper internal definitions of wal sender state
|
||||
///
|
||||
/// As opposed to [`safekeeper_api::models::WalSenderState`] these structs may
/// include state that we do not wish to expose to the public api.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum WalSenderState {
|
||||
Vanilla(VanillaWalSenderInternalState),
|
||||
Interpreted(InterpretedWalSenderInternalState),
|
||||
}
|
||||
|
||||
type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct InterpretedWalSenderInternalState {
|
||||
public_state: safekeeper_api::models::InterpretedWalSenderState,
|
||||
interpreted_wal_reader: Option<Arc<InterpretedWalReaderHandle>>,
|
||||
}
|
||||
|
||||
impl WalSenderState {
|
||||
fn get_addr(&self) -> &SocketAddr {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &state.addr,
|
||||
WalSenderState::Interpreted(state) => &state.public_state.addr,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_feedback(&self) -> &ReplicationFeedback {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &state.feedback,
|
||||
WalSenderState::Interpreted(state) => &state.public_state.feedback,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &mut state.feedback,
|
||||
WalSenderState::Interpreted(state) => &mut state.public_state.feedback,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl WalSendersShared {
|
||||
fn new() -> Self {
|
||||
WalSendersShared {
|
||||
@@ -225,7 +347,7 @@ impl WalSendersShared {
|
||||
let mut agg = HotStandbyFeedback::empty();
|
||||
let mut reply_agg = StandbyReply::empty();
|
||||
for ws_state in self.slots.iter().flatten() {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() {
|
||||
let hs_feedback = standby_feedback.hs_feedback;
|
||||
// doing Option math like op1.iter().chain(op2.iter()).min()
|
||||
// would be nicer, but we serialize/deserialize this struct
|
||||
@@ -317,7 +439,7 @@ impl SafekeeperPostgresHandler {
|
||||
/// Wrapper around handle_start_replication_guts handling result. Error is
|
||||
/// handled here while we're still in walsender ttid span; with API
|
||||
/// extension, this can probably be moved into postgres_backend.
|
||||
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
@@ -342,7 +464,7 @@ impl SafekeeperPostgresHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
@@ -352,12 +474,30 @@ impl SafekeeperPostgresHandler {
|
||||
let appname = self.appname.clone();
|
||||
|
||||
// Use a guard object to remove our entry from the timeline when we are done.
|
||||
let ws_guard = Arc::new(tli.get_walsenders().register(
|
||||
self.ttid,
|
||||
*pgb.get_peer_addr(),
|
||||
self.conn_id,
|
||||
self.appname.clone(),
|
||||
));
|
||||
let ws_guard = match self.protocol() {
|
||||
PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register(
|
||||
WalSenderState::Vanilla(VanillaWalSenderInternalState {
|
||||
ttid: self.ttid,
|
||||
addr: *pgb.get_peer_addr(),
|
||||
conn_id: self.conn_id,
|
||||
appname: self.appname.clone(),
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
}),
|
||||
)),
|
||||
PostgresClientProtocol::Interpreted { .. } => Arc::new(tli.get_walsenders().register(
|
||||
WalSenderState::Interpreted(InterpretedWalSenderInternalState {
|
||||
public_state: safekeeper_api::models::InterpretedWalSenderState {
|
||||
ttid: self.ttid,
|
||||
shard: self.shard.unwrap(),
|
||||
addr: *pgb.get_peer_addr(),
|
||||
conn_id: self.conn_id,
|
||||
appname: self.appname.clone(),
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
},
|
||||
interpreted_wal_reader: None,
|
||||
}),
|
||||
)),
|
||||
};
|
||||
|
||||
// Walsender can operate in one of two modes which we select by
|
||||
// application_name: give only committed WAL (used by pageserver) or all
|
||||
@@ -403,7 +543,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb,
|
||||
// should succeed since we're already holding another guard
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
appname,
|
||||
appname: appname.clone(),
|
||||
start_pos,
|
||||
end_pos,
|
||||
term,
|
||||
@@ -413,7 +553,7 @@ impl SafekeeperPostgresHandler {
|
||||
send_buf: vec![0u8; MAX_SEND_SIZE],
|
||||
};
|
||||
|
||||
Either::Left(sender.run())
|
||||
FutureExt::boxed(sender.run())
|
||||
}
|
||||
PostgresClientProtocol::Interpreted {
|
||||
format,
|
||||
@@ -421,27 +561,96 @@ impl SafekeeperPostgresHandler {
|
||||
} => {
|
||||
let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
|
||||
let end_watch_view = end_watch.view();
|
||||
let wal_stream_builder = WalReaderStreamBuilder {
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_pos,
|
||||
end_pos,
|
||||
term,
|
||||
end_watch,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
};
|
||||
let wal_residence_guard = tli.wal_residence_guard().await?;
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(2);
|
||||
let shard = self.shard.unwrap();
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
pgb,
|
||||
wal_stream_builder,
|
||||
end_watch_view,
|
||||
shard: self.shard.unwrap(),
|
||||
pg_version,
|
||||
appname,
|
||||
};
|
||||
if self.conf.wal_reader_fanout && !shard.is_unsharded() {
|
||||
let ws_id = ws_guard.id();
|
||||
ws_guard.walsenders().create_or_update_interpreted_reader(
|
||||
ws_id,
|
||||
start_pos,
|
||||
self.conf.max_delta_for_fanout,
|
||||
{
|
||||
let tx = tx.clone();
|
||||
|reader| {
|
||||
tracing::info!(
|
||||
"Fanning out interpreted wal reader at {}",
|
||||
start_pos
|
||||
);
|
||||
reader
|
||||
.fanout(shard, tx, start_pos)
|
||||
.with_context(|| "Failed to fan out reader")
|
||||
}
|
||||
},
|
||||
|| {
|
||||
tracing::info!("Spawning interpreted wal reader at {}", start_pos);
|
||||
|
||||
Either::Right(sender.run())
|
||||
let wal_stream = StreamingWalReader::new(
|
||||
wal_residence_guard,
|
||||
term,
|
||||
start_pos,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
InterpretedWalReader::spawn(
|
||||
wal_stream, start_pos, tx, shard, pg_version, &appname,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
appname,
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_lsn: start_pos,
|
||||
pgb,
|
||||
end_watch_view,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
rx,
|
||||
};
|
||||
|
||||
FutureExt::boxed(sender.run())
|
||||
} else {
|
||||
let wal_reader = StreamingWalReader::new(
|
||||
wal_residence_guard,
|
||||
term,
|
||||
start_pos,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let reader =
|
||||
InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version);
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
appname: appname.clone(),
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_lsn: start_pos,
|
||||
pgb,
|
||||
end_watch_view,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
rx,
|
||||
};
|
||||
|
||||
FutureExt::boxed(async move {
|
||||
// Sender returns an Err on all code paths.
|
||||
// If the sender finishes first, we will drop the reader future.
|
||||
// If the reader finishes first, the sender will finish too since
|
||||
// the wal sender has dropped.
|
||||
let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname));
|
||||
match res.map(|_| ()) {
|
||||
Ok(_) => unreachable!("sender finishes with Err by convention"),
|
||||
err_res => err_res,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -470,7 +679,8 @@ impl SafekeeperPostgresHandler {
|
||||
.clone();
|
||||
info!(
|
||||
"finished streaming to {}, feedback={:?}",
|
||||
ws_state.addr, ws_state.feedback,
|
||||
ws_state.get_addr(),
|
||||
ws_state.get_feedback(),
|
||||
);
|
||||
|
||||
// Join pg backend back.
|
||||
@@ -578,6 +788,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
|
||||
/// convenience.
|
||||
async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&[
|
||||
"future",
|
||||
self.appname.as_deref().unwrap_or("safekeeper"),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
// Wait for the next portion if it is not there yet, or just
|
||||
// update our end of WAL available for sending value, we
|
||||
@@ -813,7 +1035,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use safekeeper_api::models::FullTransactionId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -830,13 +1052,13 @@ mod tests {
|
||||
|
||||
// add to wss specified feedback setting other fields to dummy values
|
||||
fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
|
||||
let walsender_state = WalSenderState {
|
||||
let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState {
|
||||
ttid: mock_ttid(),
|
||||
addr: mock_addr(),
|
||||
conn_id: 1,
|
||||
appname: None,
|
||||
feedback,
|
||||
};
|
||||
});
|
||||
wss.slots.push(Some(walsender_state))
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,19 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
|
||||
use crate::receive_wal::WalAcceptor;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
|
||||
ProposerElected, SafeKeeper, TermHistory,
|
||||
};
|
||||
use crate::send_wal::EndWatch;
|
||||
use crate::state::{TimelinePersistentState, TimelineState};
|
||||
use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::remote_timeline_path;
|
||||
use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf};
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
|
||||
use tokio::fs::create_dir_all;
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -107,4 +113,59 @@ impl Env {
|
||||
);
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
// This will be dead code when building a non-benchmark target with the
|
||||
// benchmarking feature enabled.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn write_wal(
|
||||
tli: Arc<Timeline>,
|
||||
start_lsn: Lsn,
|
||||
msg_size: usize,
|
||||
msg_count: usize,
|
||||
) -> anyhow::Result<EndWatch> {
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
|
||||
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
|
||||
|
||||
let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx());
|
||||
|
||||
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
|
||||
|
||||
let prefix = c"p";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(msg_size >= prefixlen);
|
||||
let message = vec![0; msg_size - prefixlen];
|
||||
|
||||
let walgen =
|
||||
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
|
||||
for _ in 0..msg_count {
|
||||
let (lsn, record) = walgen.next().unwrap();
|
||||
|
||||
let req = AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: start_lsn,
|
||||
begin_lsn: lsn,
|
||||
end_lsn: lsn + record.len() as u64,
|
||||
commit_lsn: lsn,
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
},
|
||||
wal_data: record,
|
||||
};
|
||||
|
||||
let end_lsn = req.h.end_lsn;
|
||||
|
||||
let msg = ProposerAcceptorMessage::AppendRequest(req);
|
||||
msg_tx.send(msg).await?;
|
||||
while let Some(reply) = reply_rx.recv().await {
|
||||
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
|
||||
if resp.flush_lsn >= end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(end_watch)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ use crate::control_file;
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
|
||||
use crate::send_wal::WalSenders;
|
||||
use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues};
|
||||
use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState};
|
||||
use crate::timeline_guard::ResidenceGuard;
|
||||
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
|
||||
@@ -712,16 +712,22 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
|
||||
let WalSendersTimelineMetricValues {
|
||||
ps_feedback_counter,
|
||||
last_ps_feedback,
|
||||
interpreted_wal_reader_tasks,
|
||||
} = self.walsenders.info_for_metrics();
|
||||
|
||||
let state = self.read_shared_state().await;
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
ps_feedback_count,
|
||||
ps_feedback_count: ps_feedback_counter,
|
||||
last_ps_feedback,
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
|
||||
interpreted_wal_reader_tasks,
|
||||
epoch_start_lsn: state.sk.term_start_lsn(),
|
||||
mem_state: state.sk.state().inmem.clone(),
|
||||
persisted_state: TimelinePersistentState::clone(state.sk.state()),
|
||||
@@ -740,7 +746,7 @@ impl Timeline {
|
||||
debug_dump::Memory {
|
||||
is_cancelled: self.is_cancelled(),
|
||||
peers_info_len: state.peers_info.0.len(),
|
||||
walsenders: self.walsenders.get_all(),
|
||||
walsenders: self.walsenders.get_all_public(),
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
|
||||
@@ -1,34 +1,16 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_stream::try_stream;
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
use postgres_backend::CopyStreamHandlerEnd;
|
||||
use safekeeper_api::Term;
|
||||
use std::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
send_wal::{EndWatch, WalSenderGuard},
|
||||
timeline::WalResidentTimeline,
|
||||
use std::{
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
|
||||
pub(crate) struct WalReaderStreamBuilder {
|
||||
pub(crate) tli: WalResidentTimeline,
|
||||
pub(crate) start_pos: Lsn,
|
||||
pub(crate) end_pos: Lsn,
|
||||
pub(crate) term: Option<Term>,
|
||||
pub(crate) end_watch: EndWatch,
|
||||
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
|
||||
}
|
||||
use bytes::Bytes;
|
||||
use futures::{stream::BoxStream, Stream, StreamExt};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl WalReaderStreamBuilder {
|
||||
pub(crate) fn start_pos(&self) -> Lsn {
|
||||
self.start_pos
|
||||
}
|
||||
}
|
||||
use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader};
|
||||
use safekeeper_api::Term;
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub(crate) struct WalBytes {
|
||||
/// Raw PG WAL
|
||||
pub(crate) wal: Bytes,
|
||||
@@ -44,106 +26,270 @@ pub(crate) struct WalBytes {
|
||||
pub(crate) available_wal_end_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl WalReaderStreamBuilder {
|
||||
/// Builds a stream of Postgres WAL starting from [`Self::start_pos`].
|
||||
/// The stream terminates when the receiver (pageserver) is fully caught up
|
||||
/// and there's no active computes.
|
||||
pub(crate) async fn build(
|
||||
self,
|
||||
buffer_size: usize,
|
||||
) -> anyhow::Result<impl Stream<Item = Result<WalBytes, CopyStreamHandlerEnd>>> {
|
||||
// TODO(vlad): The code below duplicates functionality from [`crate::send_wal`].
|
||||
// We can make the raw WAL sender use this stream too and remove the duplication.
|
||||
let Self {
|
||||
tli,
|
||||
mut start_pos,
|
||||
mut end_pos,
|
||||
term,
|
||||
mut end_watch,
|
||||
wal_sender_guard,
|
||||
} = self;
|
||||
let mut wal_reader = tli.get_walreader(start_pos).await?;
|
||||
let mut buffer = vec![0; buffer_size];
|
||||
struct PositionedWalReader {
|
||||
start: Lsn,
|
||||
end: Lsn,
|
||||
reader: Option<WalReader>,
|
||||
}
|
||||
|
||||
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
/// A streaming WAL reader wrapper which can be reset while running
|
||||
pub(crate) struct StreamingWalReader {
|
||||
stream: BoxStream<'static, WalOrReset>,
|
||||
start_changed_tx: tokio::sync::watch::Sender<Lsn>,
|
||||
}
|
||||
|
||||
Ok(try_stream! {
|
||||
loop {
|
||||
let have_something_to_send = end_pos > start_pos;
|
||||
pub(crate) enum WalOrReset {
|
||||
Wal(anyhow::Result<WalBytes>),
|
||||
Reset(Lsn),
|
||||
}
|
||||
|
||||
if !have_something_to_send {
|
||||
// wait for lsn
|
||||
let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await;
|
||||
match res {
|
||||
Ok(ok) => {
|
||||
end_pos = ok?;
|
||||
},
|
||||
Err(_) => {
|
||||
if let EndWatch::Commit(_) = end_watch {
|
||||
if let Some(remote_consistent_lsn) = wal_sender_guard
|
||||
.walsenders()
|
||||
.get_ws_remote_consistent_lsn(wal_sender_guard.id())
|
||||
{
|
||||
if tli.should_walsender_stop(remote_consistent_lsn).await {
|
||||
// Stop streaming if the receivers are caught up and
|
||||
// there's no active compute. This causes the loop in
|
||||
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
|
||||
// to exit and terminate the WAL stream.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
assert!(
|
||||
end_pos > start_pos,
|
||||
"nothing to send after waiting for WAL"
|
||||
);
|
||||
|
||||
// try to send as much as available, capped by the buffer size
|
||||
let mut chunk_end_pos = start_pos + buffer_size as u64;
|
||||
// if we went behind available WAL, back off
|
||||
if chunk_end_pos >= end_pos {
|
||||
chunk_end_pos = end_pos;
|
||||
} else {
|
||||
// If sending not up to end pos, round down to page boundary to
|
||||
// avoid breaking WAL record not at page boundary, as protocol
|
||||
// demands. See walsender.c (XLogSendPhysical).
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
let send_size = (chunk_end_pos.0 - start_pos.0) as usize;
|
||||
let buffer = &mut buffer[..send_size];
|
||||
let send_size: usize;
|
||||
{
|
||||
// If uncommitted part is being pulled, check that the term is
|
||||
// still the expected one.
|
||||
let _term_guard = if let Some(t) = term {
|
||||
Some(tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// Read WAL into buffer. send_size can be additionally capped to
|
||||
// segment boundary here.
|
||||
send_size = wal_reader.read(buffer).await?
|
||||
};
|
||||
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
|
||||
|
||||
yield WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: start_pos,
|
||||
wal_end_lsn: start_pos + send_size as u64,
|
||||
available_wal_end_lsn: end_pos
|
||||
};
|
||||
|
||||
start_pos += send_size as u64;
|
||||
}
|
||||
})
|
||||
impl WalOrReset {
pub(crate) fn get_wal(self) -> Option<anyhow::Result<WalBytes>> {
match self {
WalOrReset::Wal(wal) => Some(wal),
WalOrReset::Reset(_) => None,
}
}
}

impl StreamingWalReader {
pub(crate) fn new(
tli: WalResidentTimeline,
term: Option<Term>,
start: Lsn,
end: Lsn,
end_watch: EndWatch,
buffer_size: usize,
) -> Self {
let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start);

let state = WalReaderStreamState {
tli,
wal_reader: PositionedWalReader {
start,
end,
reader: None,
},
term,
end_watch,
buffer: vec![0; buffer_size],
buffer_size,
};

// When a change notification is received while polling the internal
// reader, stop polling the read future and service the change.
let stream = futures::stream::unfold(
(state, start_changed_rx),
|(mut state, mut rx)| async move {
let wal_or_reset = tokio::select! {
read_res = state.read() => { WalOrReset::Wal(read_res) },
changed_res = rx.changed() => {
if changed_res.is_err() {
return None;
}

let new_start_pos = rx.borrow_and_update();
WalOrReset::Reset(*new_start_pos)
}
};

if let WalOrReset::Reset(lsn) = wal_or_reset {
state.wal_reader.start = lsn;
state.wal_reader.reader = None;
}

Some((wal_or_reset, (state, rx)))
},
)
.boxed();

Self {
stream,
start_changed_tx,
}
}

/// Reset the stream to a given position.
pub(crate) async fn reset(&mut self, start: Lsn) {
self.start_changed_tx.send(start).unwrap();
while let Some(wal_or_reset) = self.stream.next().await {
match wal_or_reset {
WalOrReset::Reset(at) => {
// Stream confirmed the reset.
// There may be only one ongoing reset at any given time,
// hence the assertion.
assert_eq!(at, start);
break;
}
WalOrReset::Wal(_) => {
// Ignore WAL generated before the reset was handled.
}
}
}
}
}

impl Stream for StreamingWalReader {
type Item = WalOrReset;

fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
Pin::new(&mut self.stream).poll_next(cx)
}
}
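
The construction above hinges on one pattern: a futures::stream::unfold state machine whose body races the long-running read against a tokio::sync::watch notification, so a reset can interrupt a read that is parked waiting for WAL. A minimal, self-contained sketch of that pattern follows; the Item type, positions, and the sleep standing in for the real WAL read are all hypothetical, and it assumes the tokio (with "full" features) and futures crates.

use futures::StreamExt;
use std::time::Duration;

#[derive(Debug)]
enum Item {
    Value(u64),
    Reset(u64),
}

#[tokio::main]
async fn main() {
    let (tx, rx) = tokio::sync::watch::channel(0u64);

    let mut stream = Box::pin(futures::stream::unfold(
        (0u64, rx),
        |(mut pos, mut rx)| async move {
            let item = tokio::select! {
                // Stand-in for the long-running WAL read future.
                _ = tokio::time::sleep(Duration::from_millis(10)) => {
                    pos += 1;
                    Item::Value(pos)
                }
                changed = rx.changed() => {
                    if changed.is_err() {
                        return None; // sender dropped: end the stream
                    }
                    pos = *rx.borrow_and_update();
                    Item::Reset(pos)
                }
            };
            Some((item, (pos, rx)))
        },
    ));

    assert!(matches!(stream.next().await, Some(Item::Value(1))));
    tx.send(100).unwrap(); // ask the stream to restart at position 100
    assert!(matches!(stream.next().await, Some(Item::Reset(100))));
}

Because unfold builds a fresh future for every item, a value sent on the watch channel is observed on the next poll at the latest, which is what lets StreamingWalReader::reset rewind the reader without tearing the stream down.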

struct WalReaderStreamState {
tli: WalResidentTimeline,
wal_reader: PositionedWalReader,
term: Option<Term>,
end_watch: EndWatch,
buffer: Vec<u8>,
buffer_size: usize,
}

impl WalReaderStreamState {
async fn read(&mut self) -> anyhow::Result<WalBytes> {
// Create reader if needed
if self.wal_reader.reader.is_none() {
self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?);
}

let have_something_to_send = self.wal_reader.end > self.wal_reader.start;
if !have_something_to_send {
tracing::debug!(
"Waiting for wal: start={}, end={}",
self.wal_reader.start,
self.wal_reader.end
);
self.wal_reader.end = self
.end_watch
.wait_for_lsn(self.wal_reader.start, self.term)
.await?;
tracing::debug!(
"Done waiting for wal: start={}, end={}",
self.wal_reader.start,
self.wal_reader.end
);
}

assert!(
self.wal_reader.end > self.wal_reader.start,
"nothing to send after waiting for WAL"
);

// Calculate chunk size
let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64;
if chunk_end_pos >= self.wal_reader.end {
chunk_end_pos = self.wal_reader.end;
} else {
chunk_end_pos = chunk_end_pos
.checked_sub(chunk_end_pos.block_offset())
.unwrap();
}

let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize;
let buffer = &mut self.buffer[..send_size];

// Read WAL
let send_size = {
let _term_guard = if let Some(t) = self.term {
Some(self.tli.acquire_term(t).await?)
} else {
None
};
self.wal_reader
.reader
.as_mut()
.unwrap()
.read(buffer)
.await?
};

let wal = Bytes::copy_from_slice(&buffer[..send_size]);
let result = WalBytes {
wal,
wal_start_lsn: self.wal_reader.start,
wal_end_lsn: self.wal_reader.start + send_size as u64,
available_wal_end_lsn: self.wal_reader.end,
};

self.wal_reader.start += send_size as u64;

Ok(result)
}
}
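
To make the chunking in read() concrete: the chunk is capped by the buffer size and then, when it does not reach the available end, rounded down to a WAL page boundary so a record is never cut at an arbitrary offset within a page. A stand-alone sketch of that arithmetic with plain u64 positions instead of Lsn; the 8192-byte XLOG_BLCKSZ is an assumption matching the Postgres default.

const XLOG_BLCKSZ: u64 = 8192;

fn cap_chunk(start: u64, end: u64, buffer_size: u64) -> u64 {
    let mut chunk_end = start + buffer_size;
    if chunk_end >= end {
        end
    } else {
        // Round down to a WAL page boundary so a record is never split at
        // an arbitrary offset within a page (see walsender.c, XLogSendPhysical).
        chunk_end -= chunk_end % XLOG_BLCKSZ;
        chunk_end
    }
}

fn main() {
    // Plenty of WAL available: capped by the buffer, then rounded down to 17 * 8192.
    assert_eq!(cap_chunk(8200, 1_000_000, 128 * 1024), 139_264);
    // Less WAL available than the buffer: capped to `end` exactly.
    assert_eq!(cap_chunk(8200, 9_000, 128 * 1024), 9_000);
}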

#[cfg(test)]
mod tests {
use std::str::FromStr;

use futures::StreamExt;
use postgres_ffi::MAX_SEND_SIZE;
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
};

use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader};

#[tokio::test]
async fn test_streaming_wal_reader_reset() {
let _ = env_logger::builder().is_test(true).try_init();

const SIZE: usize = 8 * 1024;
const MSG_COUNT: usize = 200;

let start_lsn = Lsn::from_str("0/149FD18").unwrap();
let env = Env::new(true).unwrap();
let tli = env
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
.await
.unwrap();

let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let end_pos = end_watch.get();

tracing::info!("Doing first round of reads ...");

let mut streaming_wal_reader = StreamingWalReader::new(
resident_tli,
None,
start_lsn,
end_pos,
end_watch,
MAX_SEND_SIZE,
);

let mut before_reset = Vec::new();
while let Some(wor) = streaming_wal_reader.next().await {
let wal = wor.get_wal().unwrap().unwrap();
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
before_reset.push(wal);

if stop {
break;
}
}

tracing::info!("Resetting the WAL stream ...");

streaming_wal_reader.reset(start_lsn).await;

tracing::info!("Doing second round of reads ...");

let mut after_reset = Vec::new();
while let Some(wor) = streaming_wal_reader.next().await {
let wal = wor.get_wal().unwrap().unwrap();
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
after_reset.push(wal);

if stop {
break;
}
}

assert_eq!(before_reset, after_reset);
}
}
@@ -180,6 +180,8 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
control_file_save_interval: Duration::from_secs(1),
partial_backup_concurrency: 1,
eviction_min_resident: Duration::ZERO,
wal_reader_fanout: false,
max_delta_for_fanout: None,
};

let mut global = GlobalMap::new(disk, conf.clone())?;

@@ -0,0 +1,2 @@
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled';
UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause';
@@ -0,0 +1,2 @@
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause';
UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled';
@@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::{
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
ShardsPreferredAzsRequest, TenantCreateRequest,
SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest,
};
use pageserver_api::models::{
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
@@ -1305,6 +1305,35 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
.unwrap())
}

/// Sets the scheduling policy of the specified safekeeper
async fn handle_safekeeper_scheduling_policy(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;

let body = json_request::<SafekeeperSchedulingPolicyRequest>(&mut req).await?;
let id = parse_request_param::<i64>(&req, "id")?;

let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};

let state = get_state(&req);

state
.service
.set_safekeeper_scheduling_policy(id, body.scheduling_policy)
.await?;

Ok(Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::empty())
.unwrap())
}
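
For reference, a hedged sketch of exercising the new endpoint from a Rust client. Everything outside the URL path and the response code is an assumption: the reqwest (with the "json" feature), serde_json, and tokio crates, a storage controller listening on http://127.0.0.1:1234, an admin-scoped token in NEON_AUTH_TOKEN, and the "Pause" policy string, which mirrors the Python test fixture further down rather than a documented contract.

use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    let token = std::env::var("NEON_AUTH_TOKEN").unwrap_or_default();

    let resp = client
        // Hypothetical address; the path and admin scope match the route and
        // permission check registered above.
        .post("http://127.0.0.1:1234/control/v1/safekeeper/1/scheduling_policy")
        .bearer_auth(token)
        .json(&json!({ "id": 1, "scheduling_policy": "Pause" }))
        .send()
        .await?;

    // On success the handler above replies with 204 No Content.
    assert_eq!(resp.status(), reqwest::StatusCode::NO_CONTENT);
    Ok(())
}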

/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(
@@ -1873,7 +1902,18 @@ pub fn make_router(
})
.post("/control/v1/safekeeper/:id", |r| {
// id is in the body
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
named_request_span(
r,
handle_upsert_safekeeper,
RequestName("v1_safekeeper_post"),
)
})
.post("/control/v1/safekeeper/:id/scheduling_policy", |r| {
named_request_span(
r,
handle_safekeeper_scheduling_policy,
RequestName("v1_safekeeper_status"),
)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {

@@ -1104,6 +1104,37 @@ impl Persistence {
})
.await
}

pub(crate) async fn set_safekeeper_scheduling_policy(
&self,
id_: i64,
scheduling_policy_: SkSchedulingPolicy,
) -> Result<(), DatabaseError> {
use crate::schema::safekeepers::dsl::*;

self.with_conn(move |conn| -> DatabaseResult<()> {
#[derive(Insertable, AsChangeset)]
#[diesel(table_name = crate::schema::safekeepers)]
struct UpdateSkSchedulingPolicy<'a> {
id: i64,
scheduling_policy: &'a str,
}
let scheduling_policy_ = String::from(scheduling_policy_);

let rows_affected = diesel::update(safekeepers.filter(id.eq(id_)))
.set(scheduling_policy.eq(scheduling_policy_))
.execute(conn)?;

if rows_affected != 1 {
return Err(DatabaseError::Logical(format!(
"unexpected number of rows ({rows_affected})",
)));
}

Ok(())
})
.await
}
}

/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably

@@ -47,7 +47,7 @@ use pageserver_api::{
AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
@@ -2109,12 +2109,6 @@ impl Service {
create_req.new_tenant_id.tenant_id
};

tracing::info!(
"Creating tenant {}, shard_count={:?}",
create_req.new_tenant_id,
create_req.shard_parameters.count,
);

let create_ids = (0..create_req.shard_parameters.count.count())
.map(|i| TenantShardId {
tenant_id,
@@ -2155,6 +2149,14 @@ impl Service {
}
};

tracing::info!(
generation=?initial_generation,
preferred_az_id=?preferred_az_id,
tenant_id=%create_req.new_tenant_id,
shard_count=?create_req.shard_parameters.count,
"Creating tenant",
);

// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
// during the creation, rather than risking leaving orphan objects in S3.
@@ -5411,6 +5413,15 @@ impl Service {

expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));

// Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig`
// definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new
// fields are added, before doing a comparison.
for tsp in &mut persistent_shards {
let config: TenantConfig = serde_json::from_str(&tsp.config)
.map_err(|e| ApiError::InternalServerError(e.into()))?;
tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible");
}

if persistent_shards != expect_shards {
tracing::error!("Consistency check failed on shards.");

@@ -7270,19 +7281,14 @@ impl Service {
Ok(())
}

/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
/// 1. The node should be filled until it reaches the expected cluster average of
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
/// throughout the cluster. We achieve this by picking tenant shards from each node,
/// starting from the ones with the largest number of attached shards, until the node
/// reaches the expected cluster average.
/// 3. Avoid promoting more shards of the same tenant than required. The upper bound
/// for the number of shards from the same tenant promoted to the node being filled is:
/// shard count for the tenant divided by the number of nodes in the cluster.
/// Create a node fill plan (pick secondaries to promote), based on:
/// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
/// outside their home AZ, should be migrated back here.
/// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of
/// attached shards, we will promote more shards from the nodes with the most attached shards, unless
/// those shards have a home AZ that doesn't match the node we're filling.
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
let mut locked = self.inner.write().unwrap();
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
let (nodes, tenants, _scheduler) = locked.parts_mut();

let node_az = nodes
@@ -7291,53 +7297,79 @@ impl Service {
.get_availability_zone_id()
.clone();

let mut tids_by_node = tenants
.iter_mut()
.filter_map(|(tid, tenant_shard)| {
if !matches!(
tenant_shard.get_scheduling_policy(),
ShardSchedulingPolicy::Active
) {
// Only include tenants in fills if they have a normal (Active) scheduling policy. We
// even exclude Essential, because moving to fill a node is not essential to keeping this
// tenant available.
return None;
}
// The tenant shard IDs that we plan to promote from secondary to attached on this node
let mut plan = Vec::new();

// AZ check: when filling nodes after a restart, our intent is to move _back_ the
// shards which belong on this node, not to promote shards whose scheduling preference
// would be on their currently attached node. So we will avoid promoting shards whose
// home AZ doesn't match the AZ of the node we're filling.
match tenant_shard.preferred_az() {
None => {
// Shard doesn't have an AZ preference: it is eligible to be moved.
}
Some(az) if az == &node_az => {
// This shard's home AZ is equal to the node we're filling: it is
// eligible to be moved: fall through;
}
Some(_) => {
// This shard's home AZ is somewhere other than the node we're filling:
// do not include it in the fill plan.
return None;
}
}
// Collect shards which do not have a preferred AZ & are eligible for moving in stage 2
let mut free_tids_by_node: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();

if tenant_shard.intent.get_secondary().contains(&node_id) {
// Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could
// conceivably come up in real life if deploying a single-AZ region intentionally.
let respect_azs = nodes
.values()
.map(|n| n.get_availability_zone_id())
.unique()
.count()
> 1;

// Step 1: collect all shards that we are required to migrate back to this node because their AZ preference
// requires it.
for (tsid, tenant_shard) in tenants {
if !tenant_shard.intent.get_secondary().contains(&node_id) {
// Shard doesn't have a secondary on this node, ignore it.
continue;
}

// AZ check: when filling nodes after a restart, our intent is to move _back_ the
// shards which belong on this node, not to promote shards whose scheduling preference
// would be on their currently attached node. So we will avoid promoting shards whose
// home AZ doesn't match the AZ of the node we're filling.
match tenant_shard.preferred_az() {
_ if !respect_azs => {
if let Some(primary) = tenant_shard.intent.get_attached() {
return Some((*primary, *tid));
free_tids_by_node.entry(*primary).or_default().push(*tsid);
}
}
None => {
// Shard doesn't have an AZ preference: it is eligible to be moved, but we
// will only do so if our target shard count requires it.
if let Some(primary) = tenant_shard.intent.get_attached() {
free_tids_by_node.entry(*primary).or_default().push(*tsid);
}
}
Some(az) if az == &node_az => {
// This shard's home AZ is equal to the node we're filling: it should
// be moved back to this node as part of filling, unless its currently
// attached location is also in its home AZ.
if let Some(primary) = tenant_shard.intent.get_attached() {
if nodes
.get(primary)
.expect("referenced node must exist")
.get_availability_zone_id()
!= tenant_shard
.preferred_az()
.expect("tenant must have an AZ preference")
{
plan.push(*tsid)
}
} else {
plan.push(*tsid)
}
}
Some(_) => {
// This shard's home AZ is somewhere other than the node we're filling,
// it may not be moved back to this node as part of filling. Ignore it.
}
}
}

None
})
.into_group_map();
// Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);

let expected_attached = locked.scheduler.expected_attached_shard_count();
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();

let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
let mut plan = Vec::new();

for (node_id, attached) in nodes_by_load {
let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available());
@@ -7346,7 +7378,7 @@ impl Service {
}

if plan.len() >= fill_requirement
|| tids_by_node.is_empty()
|| free_tids_by_node.is_empty()
|| attached <= expected_attached
{
break;
@@ -7358,7 +7390,7 @@ impl Service {

let mut remove_node = false;
while take > 0 {
match tids_by_node.get_mut(&node_id) {
match free_tids_by_node.get_mut(&node_id) {
Some(tids) => match tids.pop() {
Some(tid) => {
let max_promote_for_tenant = std::cmp::max(
@@ -7384,7 +7416,7 @@ impl Service {
}

if remove_node {
tids_by_node.remove(&node_id);
free_tids_by_node.remove(&node_id);
}
}
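
The step-2 loop above can be hard to follow inside the diff, so here is a self-contained sketch of the balancing idea with hypothetical types (plain integers instead of NodeId and TenantShardId, and without the per-tenant promotion cap): take movable shards from the most loaded donor nodes until this node reaches its fill requirement or the donors drop to the cluster average.

use std::collections::HashMap;

fn plan_fill(
    fill_requirement: usize,
    mut free_tids_by_node: HashMap<u64, Vec<u32>>, // donor node -> movable shard ids
    nodes_by_load: Vec<(u64, usize)>,              // (donor node, attached count), most loaded first
    expected_attached: usize,                      // cluster average of attached shards
) -> Vec<u32> {
    let mut plan = Vec::new();
    for (node_id, attached) in nodes_by_load {
        if plan.len() >= fill_requirement || attached <= expected_attached {
            break;
        }
        // Take at most the donor's surplus over the cluster average.
        let mut take = attached - expected_attached;
        while take > 0 && plan.len() < fill_requirement {
            match free_tids_by_node.get_mut(&node_id).and_then(|tids| tids.pop()) {
                Some(tid) => {
                    plan.push(tid);
                    take -= 1;
                }
                None => break,
            }
        }
    }
    plan
}

fn main() {
    let mut free = HashMap::new();
    free.insert(1u64, vec![10u32, 11, 12]);
    free.insert(2u64, vec![20u32]);
    // Node 1 holds 5 attachments, node 2 holds 3; the average is 3 and we need 2 promotions.
    let plan = plan_fill(2, free, vec![(1, 5), (2, 3)], 3);
    assert_eq!(plan, vec![12, 11]);
}

In the real controller code the per-tenant cap (promoted_per_tenant and max_promote_for_tenant) additionally limits how many shards of one tenant end up on the node being filled.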

@@ -7651,6 +7683,16 @@ impl Service {
self.persistence.safekeeper_upsert(record).await
}

pub(crate) async fn set_safekeeper_scheduling_policy(
&self,
id: i64,
scheduling_policy: SkSchedulingPolicy,
) -> Result<(), DatabaseError> {
self.persistence
.set_safekeeper_scheduling_policy(id, scheduling_policy)
.await
}

pub(crate) async fn update_shards_preferred_azs(
&self,
req: ShardsPreferredAzsRequest,

@@ -2336,6 +2336,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
json=body,
)

def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str):
self.request(
"POST",
f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy",
headers=self.headers(TokenScope.ADMIN),
json={"id": id, "scheduling_policy": scheduling_policy},
)

def get_safekeeper(self, id: int) -> dict[str, Any] | None:
try:
response = self.request(
@@ -4135,7 +4143,7 @@ class Endpoint(PgProtocol, LogUtils):

# Checkpoints running endpoint and returns pg_wal size in MB.
def get_pg_wal_size(self):
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}")
self.safe_psql("checkpoint")
assert self.pgdata_dir is not None # please mypy
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
@@ -4975,7 +4983,7 @@ def logical_replication_sync(
if res:
log.info(f"subscriber_lsn={res}")
subscriber_lsn = Lsn(res)
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
if subscriber_lsn >= publisher_lsn:
return subscriber_lsn
time.sleep(0.5)

@@ -53,6 +53,22 @@ class Workload:
self._endpoint: Endpoint | None = None
self._endpoint_opts = endpoint_opts or {}

def branch(
self,
timeline_id: TimelineId,
branch_name: str | None = None,
endpoint_opts: dict[str, Any] | None = None,
) -> Workload:
"""
Checkpoint the current status of the workload in case of branching
"""
branch_workload = Workload(
self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts
)
branch_workload.expect_rows = self.expect_rows
branch_workload.churn_cursor = self.churn_cursor
return branch_workload

def reconfigure(self) -> None:
"""
Request the endpoint to reconfigure based on location reported by storage controller

@@ -112,7 +112,11 @@ page_cache_size=10


@skip_in_debug_build("only run with release build")
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"with_branches",
["with_branches", "no_branches"],
)
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str):
SMOKE_CONF = {
# Run both gc and gc-compaction.
"gc_period": "5s",
@@ -143,12 +147,17 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)

child_workloads: list[Workload] = []

for i in range(1, churn_rounds + 1):
if i % 10 == 0:
log.info(f"Running churn round {i}/{churn_rounds} ...")

if (i - 1) % 10 == 0:
# Run gc-compaction every 10 rounds to ensure the test doesn't take too long.
if i % 10 == 5 and with_branches == "with_branches":
branch_name = f"child-{i}"
branch_timeline_id = env.create_branch(branch_name)
child_workloads.append(workload.branch(branch_timeline_id, branch_name))
if (i - 1) % 10 == 0 or (i - 1) % 10 == 1:
# Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long.
ps_http.timeline_compact(
tenant_id,
timeline_id,
@@ -179,6 +188,9 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):

log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
for child_workload in child_workloads:
log.info(f"Validating at branch {child_workload.branch_name}")
child_workload.validate(env.pageserver.id)

# Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)

@@ -4,9 +4,19 @@ import time
from fixtures.neon_fixtures import NeonEnv

BTREE_NUM_CYCLEID_PAGES = """
WITH raw_pages AS (
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
WITH lsns AS (
/*
* pg_switch_wal() ensures we have an LSN that
* 1. is after any previous modifications, but also,
* 2. (critically) is flushed, preventing any issues with waiting for
* unflushed WAL in PageServer.
*/
SELECT pg_switch_wal() as lsn
),
raw_pages AS (
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno,
lsns l(lsn)
),
parsed_pages AS (
/* cycle ID is the last 2 bytes of the btree page */
@@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")

ses1.execute("SELECT neon_xlogflush();")
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert (
@@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
ses1.execute("DELETE FROM t WHERE id <= 610;")

# Flush wal, for checking purposes
ses1.execute("SELECT neon_xlogflush();")
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()
assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
@@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
# unpin the btree page, allowing s3's vacuum to complete
ses2.execute("FETCH ALL FROM foo;")
ses2.execute("ROLLBACK;")
# flush WAL to make sure PS is up-to-date
ses1.execute("SELECT neon_xlogflush();")
# check that our expectations are correct
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
pages = ses1.fetchall()

@@ -6,14 +6,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.pageserver.http import PageserverHttpClient


def check_tenant(
env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int
):
def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
tenant_id, timeline_id = env.create_tenant()
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
res_1 = endpoint.safe_psql_many(
queries=[
@@ -38,14 +33,7 @@ def check_tenant(


@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)])
# Test both proto versions until we fully migrate.
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_normal_work(
neon_env_builder: NeonEnvBuilder,
num_timelines: int,
num_safekeepers: int,
safekeeper_proto_version: int,
):
def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int):
"""
Basic test:
* create new tenant with a timeline
@@ -64,4 +52,4 @@ def test_normal_work(
pageserver_http = env.pageserver.http_client()

for _ in range(num_timelines):
check_tenant(env, pageserver_http, safekeeper_proto_version)
check_tenant(env, pageserver_http)

@@ -3208,6 +3208,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):

assert eq_safekeeper_records(body, inserted_now)

# Some small checks of the scheduling policy get/set APIs
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Pause"
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
newest_info = target.get_safekeeper(inserted["id"])
assert newest_info
assert newest_info["scheduling_policy"] == "Decomissioned"
# Ensure idempotency
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")


def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
compared = [dict(a), dict(b)]

@@ -1,6 +1,7 @@
from __future__ import annotations

import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread

import pytest
@@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))

def timeline_create():
try:
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
raise RuntimeError("creation succeeded even though it shouldn't")
except ReadTimeout:
pass

Thread(target=timeline_create).start()

def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

wait_until(hit_initdb_upload_failpoint)

def creation_connection_timed_out():
env.pageserver.assert_log_contains(
"POST.*/timeline.* request was dropped before completing"
)

# Wait so that we hit the timeout and the connection is dropped
# (But timeline creation still continues)
wait_until(creation_connection_timed_out)

ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
raise RuntimeError("creation succeeded even though it shouldn't")

def tenant_delete():
def tenant_delete_inner():
@@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)

wait_until(tenant_delete_inner)

Thread(target=tenant_delete).start()
# We will spawn background threads for timeline creation and tenant deletion. They will both
# get blocked on our failpoint.
with ThreadPoolExecutor(max_workers=1) as executor:
create_fut = executor.submit(timeline_create)

def deletion_arrived():
env.pageserver.assert_log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

wait_until(deletion_arrived)
wait_until(hit_initdb_upload_failpoint)

ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
def creation_connection_timed_out():
env.pageserver.assert_log_contains(
"POST.*/timeline.* request was dropped before completing"
)

# Disable the failpoint and wait for deletion to finish
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
# Wait so that we hit the timeout and the connection is dropped
# (But timeline creation still continues)
wait_until(creation_connection_timed_out)

ps_http.tenant_delete(tenant_id)
with pytest.raises(ReadTimeout):
# Our creation failed from the client's point of view.
create_fut.result()

ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))

delete_fut = executor.submit(tenant_delete)

def deletion_arrived():
env.pageserver.assert_log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)

wait_until(deletion_arrived)

ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))

# Disable the failpoint and wait for deletion to finish
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

delete_fut.result()

# Physical deletion should have happened
assert_prefix_empty(

Submodule vendor/postgres-v14 updated: 210a0ba3af...46082f2088
Submodule vendor/postgres-v15 updated: d3141e17a7...dd0b28d6fb
Submodule vendor/postgres-v16 updated: f63b141cfb...d674efd776
Submodule vendor/postgres-v17 updated: 0f8da73ed0...a8dd6e779d
vendor/revisions.json (vendored)
@@ -1,18 +1,18 @@
|
||||
{
|
||||
"v17": [
|
||||
"17.2",
|
||||
"0f8da73ed08d4fc4ee58cccea008c75bfb20baa8"
|
||||
"a8dd6e779dde907778006adb436b557ad652fb97"
|
||||
],
|
||||
"v16": [
|
||||
"16.6",
|
||||
"f63b141cfb0c813725a6b2574049565bff643018"
|
||||
"d674efd776f59d78e8fa1535bd2f95c3e6984fca"
|
||||
],
|
||||
"v15": [
|
||||
"15.10",
|
||||
"d3141e17a7155e3d07c8deba4a10c748a29ba1e6"
|
||||
"dd0b28d6fbad39e227f3b77296fcca879af8b3a9"
|
||||
],
|
||||
"v14": [
|
||||
"14.15",
|
||||
"210a0ba3afd8134ea910b203f274b165bd4f05d7"
|
||||
"46082f20884f087a2d974b33ac65d63af26142bd"
|
||||
]
|
||||
}
|
||||
|
||||