Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-02 10:10:37 +00:00)

Compare commits: release-co...lfc_strict (28 commits)
- d40272f783
- 02fc58b878
- b312a3c320
- 7d761a9d22
- 8bdaee35f3
- b0f34099f9
- 6975228a76
- 053abff71f
- 871e8b325f
- c47c5f4ace
- b0838a68e5
- 8f2ebc0684
- 3a285a046b
- da13154791
- 2e13a3aa7a
- cccc196848
- e436dcad57
- 21d7b6a258
- 86dbc44db1
- 58f6af6c9a
- 7be971081a
- 6fe4c6798f
- 2eda484ef6
- c7429af8a0
- a753349cb0
- 55a68b28a2
- fb0e2acb2f
- efaec6cdf8
.github/workflows/_check-codestyle-rust.yml (vendored, new file, 91 lines)

```yaml
name: Check Codestyle Rust

on:
  workflow_call:
    inputs:
      build-tools-image:
        description: "build-tools image"
        required: true
        type: string
      archs:
        description: "Json array of architectures to run on"
        type: string

defaults:
  run:
    shell: bash -euxo pipefail {0}

jobs:
  check-codestyle-rust:
    strategy:
      matrix:
        arch: ${{ fromJson(inputs.archs) }}
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Cache cargo deps
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
            ~/.cargo/git
            target
          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust

      # Some of our rust modules use FFI and need those to be checked
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
      # This will catch compiler & clippy warnings in all feature combinations.
      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
      # NB: keep clippy args in sync with ./run_clippy.sh
      #
      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything a second
      # time just for that, so skip "clippy --release".
      - run: |
          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
            echo "No clippy args found in .neon_clippy_args"
            exit 1
          fi
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS

      - name: Check documentation generation
        run: cargo doc --workspace --no-deps --document-private-items
        env:
          RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"

      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
        run: cargo fmt --all -- --check

      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
      - name: Check rust dependencies
        if: ${{ !cancelled() }}
        run: |
          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack

      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
        run: cargo deny check --hide-inclusion-graph
```
.github/workflows/build_and_test.yml (vendored, 78 lines changed)

```diff
@@ -164,77 +164,11 @@ jobs:

   check-codestyle-rust:
     needs: [ check-permissions, build-build-tools-image ]
-    strategy:
-      matrix:
-        arch: [ x64, arm64 ]
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-
-      - name: Cache cargo deps
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            !~/.cargo/registry/src
-            ~/.cargo/git
-            target
-          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
-
-      # Some of our rust modules use FFI and need those to be checked
-      - name: Get postgres headers
-        run: make postgres-headers -j$(nproc)
-
-      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
-      # This will catch compiler & clippy warnings in all feature combinations.
-      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
-      # NB: keep clippy args in sync with ./run_clippy.sh
-      #
-      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
-      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything a second
-      # time just for that, so skip "clippy --release".
-      - run: |
-          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
-          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
-            echo "No clippy args found in .neon_clippy_args"
-            exit 1
-          fi
-          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
-      - name: Run cargo clippy (debug)
-        run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
-
-      - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
-        env:
-          RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
-
-      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
-      - name: Check formatting
-        if: ${{ !cancelled() }}
-        run: cargo fmt --all -- --check
-
-      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
-      - name: Check rust dependencies
-        if: ${{ !cancelled() }}
-        run: |
-          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
-          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
-
-      # https://github.com/EmbarkStudios/cargo-deny
-      - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
-        run: cargo deny check --hide-inclusion-graph
+    uses: ./.github/workflows/_check-codestyle-rust.yml
+    with:
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      archs: '["x64", "arm64"]'
+    secrets: inherit

   build-and-test-locally:
     needs: [ tag, build-build-tools-image ]
@@ -890,7 +824,7 @@ jobs:
       docker compose -f ./docker-compose/docker-compose.yml down

   promote-images-dev:
-    needs: [ check-permissions, tag, vm-compute-node-image ]
+    needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
     runs-on: ubuntu-22.04

     permissions:
```
.github/workflows/pre-merge-checks.yml (vendored, 33 lines changed)

```diff
@@ -1,6 +1,12 @@
 name: Pre-merge checks

 on:
+  pull_request:
+    paths:
+      - .github/workflows/_check-codestyle-python.yml
+      - .github/workflows/_check-codestyle-rust.yml
+      - .github/workflows/build-build-tools-image.yml
+      - .github/workflows/pre-merge-checks.yml
   merge_group:
     branches:
       - main
@@ -17,8 +23,10 @@ jobs:
     runs-on: ubuntu-22.04
     outputs:
       python-changed: ${{ steps.python-src.outputs.any_changed }}
+      rust-changed: ${{ steps.rust-src.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4

       - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
         id: python-src
         with:
@@ -30,11 +38,25 @@ jobs:
           poetry.lock
           pyproject.toml

+      - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
+        id: rust-src
+        with:
+          files: |
+            .github/workflows/_check-codestyle-rust.yml
+            .github/workflows/build-build-tools-image.yml
+            .github/workflows/pre-merge-checks.yml
+            **/**.rs
+            **/Cargo.toml
+            Cargo.toml
+            Cargo.lock
+
       - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
         env:
           PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
+          RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
         run: |
           echo "${PYTHON_CHANGED_FILES}"
+          echo "${RUST_CHANGED_FILES}"

   build-build-tools-image:
     if: needs.get-changed-files.outputs.python-changed == 'true'
@@ -55,6 +77,16 @@ jobs:
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
     secrets: inherit

+  check-codestyle-rust:
+    if: needs.get-changed-files.outputs.rust-changed == 'true'
+    needs: [ get-changed-files, build-build-tools-image ]
+    uses: ./.github/workflows/_check-codestyle-rust.yml
+    with:
+      # `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
+      archs: '["x64"]'
+    secrets: inherit
+
   # To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
   # Currently we require 2 jobs (checks with exact name):
   # - conclusion
@@ -67,6 +99,7 @@ jobs:
     needs:
       - get-changed-files
       - check-codestyle-python
+      - check-codestyle-rust
     runs-on: ubuntu-22.04
    steps:
       - name: Create fake `neon-cloud-e2e` check
```
Cargo.lock (generated, 4 lines changed)

```diff
@@ -1312,6 +1312,7 @@ dependencies = [
  "tracing-utils",
  "url",
  "utils",
+ "uuid",
  "vm_monitor",
  "workspace_hack",
  "zstd",
@@ -3981,9 +3982,11 @@ name = "pagectl"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "bincode",
  "camino",
  "clap",
+ "humantime",
  "itertools 0.10.5",
  "pageserver",
  "pageserver_api",
  "postgres_ffi",
@@ -4005,6 +4008,7 @@ dependencies = [
  "arc-swap",
  "async-compression",
  "async-stream",
+ "bincode",
  "bit_field",
  "byteorder",
  "bytes",
```
```diff
@@ -66,6 +66,7 @@ RUN cd postgres && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
     # Enable some of contrib extensions
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
@@ -994,24 +995,50 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7c3ccd04
 #########################################################################################
 #
 # Layer "pg-pgx-ulid-build"
-# Compile "pgx_ulid" extension
+# Compile "pgx_ulid" extension for v16 and below
 #
 #########################################################################################

 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION

-# doesn't support v17 yet
-# https://github.com/pksunkara/pgx_ulid/pull/52
-RUN case "${PG_VERSION}" in "v17") \
-    echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
+RUN case "${PG_VERSION}" in \
+    "v14" | "v15" | "v16") \
+    ;; \
+    *) \
+    echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
+    ;; \
     esac && \
     wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
     echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
     mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release && \
-    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control
+
+#########################################################################################
+#
+# Layer "pg-pgx-ulid-pgrx12-build"
+# Compile "pgx_ulid" extension for v17 and up
+#
+#########################################################################################
+
+FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build
+ARG PG_VERSION
+
+RUN case "${PG_VERSION}" in \
+    "v17") \
+    ;; \
+    *) \
+    echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
+    ;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \
+    echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control

 #########################################################################################
 #
@@ -1156,6 +1183,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
```
```diff
@@ -51,6 +51,7 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 thiserror.workspace = true
 url.workspace = true
+uuid.workspace = true
 prometheus.workspace = true

 postgres_initdb.workspace = true
```
```diff
@@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use clap::Parser;
 use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
 use nix::unistd::Pid;
-use tracing::{info, info_span, warn, Instrument};
+use tracing::{error, info, info_span, warn, Instrument};
 use utils::fs_ext::is_directory_empty;

 #[path = "fast_import/aws_s3_sync.rs"]
@@ -41,12 +41,19 @@ mod child_stdio_to_log;
 #[path = "fast_import/s3_uri.rs"]
 mod s3_uri;

+const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
+const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
+
 #[derive(clap::Parser)]
 struct Args {
     #[clap(long)]
     working_directory: Utf8PathBuf,
     #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
-    s3_prefix: s3_uri::S3Uri,
+    s3_prefix: Option<s3_uri::S3Uri>,
+    #[clap(long)]
+    source_connection_string: Option<String>,
+    #[clap(short, long)]
+    interactive: bool,
     #[clap(long)]
     pg_bin_dir: Utf8PathBuf,
     #[clap(long)]
@@ -77,30 +84,70 @@ pub(crate) async fn main() -> anyhow::Result<()> {

     info!("starting");

-    let Args {
-        working_directory,
-        s3_prefix,
-        pg_bin_dir,
-        pg_lib_dir,
-    } = Args::parse();
+    let args = Args::parse();

-    let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
+    // Validate arguments
+    if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
+        anyhow::bail!("either s3_prefix or source_connection_string must be specified");
+    }
+    if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
+        anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
+    }

-    let spec: Spec = {
-        let spec_key = s3_prefix.append("/spec.json");
-        let s3_client = aws_sdk_s3::Client::new(&aws_config);
-        let object = s3_client
-            .get_object()
-            .bucket(&spec_key.bucket)
-            .key(spec_key.key)
-            .send()
-            .await
-            .context("get spec from s3")?
-            .body
-            .collect()
-            .await
-            .context("download spec body")?;
-        serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
-    };
+    let working_directory = args.working_directory;
+    let pg_bin_dir = args.pg_bin_dir;
+    let pg_lib_dir = args.pg_lib_dir;
+
+    // Initialize AWS clients only if s3_prefix is specified
+    let (aws_config, kms_client) = if args.s3_prefix.is_some() {
+        let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
+        let kms = aws_sdk_kms::Client::new(&config);
+        (Some(config), Some(kms))
+    } else {
+        (None, None)
+    };
+
+    // Get source connection string either from S3 spec or direct argument
+    let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
+        let spec: Spec = {
+            let spec_key = s3_prefix.append("/spec.json");
+            let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
+            let object = s3_client
+                .get_object()
+                .bucket(&spec_key.bucket)
+                .key(spec_key.key)
+                .send()
+                .await
+                .context("get spec from s3")?
+                .body
+                .collect()
+                .await
+                .context("download spec body")?;
+            serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
+        };
+
+        match spec.encryption_secret {
+            EncryptionSecret::KMS { key_id } => {
+                let mut output = kms_client
+                    .unwrap()
+                    .decrypt()
+                    .key_id(key_id)
+                    .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
+                        spec.source_connstring_ciphertext_base64,
+                    ))
+                    .send()
+                    .await
+                    .context("decrypt source connection string")?;
+                let plaintext = output
+                    .plaintext
+                    .take()
+                    .context("get plaintext source connection string")?;
+                String::from_utf8(plaintext.into_inner())
+                    .context("parse source connection string as utf8")?
+            }
+        }
+    } else {
+        args.source_connection_string.unwrap()
+    };

     match tokio::fs::create_dir(&working_directory).await {
@@ -123,15 +170,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
         .await
         .context("create pgdata directory")?;

-    //
-    // Setup clients
-    //
-    let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
-    let kms_client = aws_sdk_kms::Client::new(&aws_config);
-
     //
     // Initialize pgdata
     //
     let pgbin = pg_bin_dir.join("postgres");
     let pg_version = match get_pg_version(pgbin.as_ref()) {
         PostgresMajorVersion::V14 => 14,
@@ -170,7 +208,13 @@ pub(crate) async fn main() -> anyhow::Result<()> {
         .args(["-c", &format!("max_parallel_workers={nproc}")])
         .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
         .args(["-c", &format!("max_worker_processes={nproc}")])
-        .args(["-c", "effective_io_concurrency=100"])
+        .args([
+            "-c",
+            &format!(
+                "effective_io_concurrency={}",
+                if cfg!(target_os = "macos") { 0 } else { 100 }
+            ),
+        ])
         .env_clear()
         .stdout(std::process::Stdio::piped())
         .stderr(std::process::Stdio::piped())
@@ -185,44 +229,58 @@ pub(crate) async fn main() -> anyhow::Result<()> {
         )
         .instrument(info_span!("postgres")),
     );

     // Create neondb database in the running postgres
     let restore_pg_connstring =
         format!("host=localhost port=5432 user={superuser} dbname=postgres");

+    let start_time = std::time::Instant::now();
+
     loop {
-        let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
-        if res.is_ok() {
-            info!("postgres is ready, could connect to it");
-            break;
+        if start_time.elapsed() > PG_WAIT_TIMEOUT {
+            error!(
+                "timeout exceeded: failed to poll postgres and create database within 10 minutes"
+            );
+            std::process::exit(1);
+        }
+
+        match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
+            Ok((client, connection)) => {
+                // Spawn the connection handling task to maintain the connection
+                tokio::spawn(async move {
+                    if let Err(e) = connection.await {
+                        warn!("connection error: {}", e);
+                    }
+                });
+
+                match client.simple_query("CREATE DATABASE neondb;").await {
+                    Ok(_) => {
+                        info!("created neondb database");
+                        break;
+                    }
+                    Err(e) => {
+                        warn!(
+                            "failed to create database: {}, retrying in {}s",
+                            e,
+                            PG_WAIT_RETRY_INTERVAL.as_secs_f32()
+                        );
+                        tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
+                        continue;
+                    }
+                }
+            }
+            Err(_) => {
+                info!(
+                    "postgres not ready yet, retrying in {}s",
+                    PG_WAIT_RETRY_INTERVAL.as_secs_f32()
+                );
+                tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
+                continue;
+            }
         }
     }

-    //
-    // Decrypt connection string
-    //
-    let source_connection_string = {
-        match spec.encryption_secret {
-            EncryptionSecret::KMS { key_id } => {
-                let mut output = kms_client
-                    .decrypt()
-                    .key_id(key_id)
-                    .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
-                        spec.source_connstring_ciphertext_base64,
-                    ))
-                    .send()
-                    .await
-                    .context("decrypt source connection string")?;
-                let plaintext = output
-                    .plaintext
-                    .take()
-                    .context("get plaintext source connection string")?;
-                String::from_utf8(plaintext.into_inner())
-                    .context("parse source connection string as utf8")?
-            }
-        }
-    };
-
     //
     // Start the work
     //
     let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");

     let dumpdir = working_directory.join("dumpdir");

@@ -310,6 +368,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
         }
     }

+    // If interactive mode, wait for Ctrl+C
+    if args.interactive {
+        info!("Running in interactive mode. Press Ctrl+C to shut down.");
+        tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
+    }
+
     info!("shutdown postgres");
     {
         nix::sys::signal::kill(
@@ -325,21 +389,24 @@ pub(crate) async fn main() -> anyhow::Result<()> {
             .context("wait for postgres to shut down")?;
     }

-    info!("upload pgdata");
-    aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
-        .await
-        .context("sync dump directory to destination")?;
-
-    info!("write status");
-    {
-        let status_dir = working_directory.join("status");
-        std::fs::create_dir(&status_dir).context("create status directory")?;
-        let status_file = status_dir.join("pgdata");
-        std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
-            .context("write status file")?;
-        aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
-            .await
-            .context("sync status directory to destination")?;
-    }
+    // Only sync if s3_prefix was specified
+    if let Some(s3_prefix) = args.s3_prefix {
+        info!("upload pgdata");
+        aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
+            .await
+            .context("sync dump directory to destination")?;
+
+        info!("write status");
+        {
+            let status_dir = working_directory.join("status");
+            std::fs::create_dir(&status_dir).context("create status directory")?;
+            let status_file = status_dir.join("pgdata");
+            std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
+                .context("write status file")?;
+            aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
+                .await
+                .context("sync status directory to destination")?;
+        }
+    }

     Ok(())
```
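The s3_prefix/source_connection_string validation above is done by hand in main(). As an aside, clap can express the same exactly-one-of constraint declaratively; a hypothetical sketch with simplified field types (this is an alternative, not what the patch does):

```rust
use clap::Parser;

/// Hypothetical variant of the Args struct above: clap itself enforces that
/// exactly one of --s3-prefix / --source-connection-string is given, instead
/// of the manual is_none()/is_some() checks in main().
#[derive(Parser, Debug)]
struct Args {
    #[clap(long)]
    working_directory: String,
    #[clap(
        long,
        conflicts_with = "source_connection_string",
        required_unless_present = "source_connection_string"
    )]
    s3_prefix: Option<String>,
    #[clap(long)]
    source_connection_string: Option<String>,
}

fn main() {
    // With the attributes above, clap rejects both "neither given" and
    // "both given" before main logic runs.
    let args = Args::parse();
    println!("{args:?}");
}
```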
```diff
@@ -17,7 +17,8 @@ use crate::{

 #[derive(Debug, Clone, Deserialize)]
 pub(in crate::http) struct ExtensionServerParams {
-    is_library: Option<bool>,
+    #[serde(default)]
+    is_library: bool,
 }

 /// Download a remote extension.
@@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension(

     remote_extensions.get_ext(
         &filename,
-        params.is_library.unwrap_or(false),
+        params.is_library,
         &compute.build_tag,
         &compute.pgversion,
     )
```
```diff
@@ -1,15 +1,14 @@
 use std::{
     net::{IpAddr, Ipv6Addr, SocketAddr},
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
+    sync::Arc,
     thread,
     time::Duration,
 };

 use anyhow::Result;
 use axum::{
+    extract::Request,
     middleware::{self, Next},
     response::{IntoResponse, Response},
     routing::{get, post},
     Router,
@@ -17,11 +16,9 @@ use axum::{
 use http::StatusCode;
 use tokio::net::TcpListener;
 use tower::ServiceBuilder;
-use tower_http::{
-    request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
-    trace::TraceLayer,
-};
+use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
 use tracing::{debug, error, info, Span};
+use uuid::Uuid;

 use super::routes::{
     check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
@@ -34,30 +31,24 @@ async fn handle_404() -> Response {
     StatusCode::NOT_FOUND.into_response()
 }

-#[derive(Clone, Default)]
-struct ComputeMakeRequestId(Arc<AtomicU64>);
+const X_REQUEST_ID: &str = "x-request-id";

-impl MakeRequestId for ComputeMakeRequestId {
-    fn make_request_id<B>(
-        &mut self,
-        _request: &http::Request<B>,
-    ) -> Option<tower_http::request_id::RequestId> {
-        let request_id = self
-            .0
-            .fetch_add(1, Ordering::SeqCst)
-            .to_string()
-            .parse()
-            .unwrap();
-
-        Some(RequestId::new(request_id))
+/// This middleware function allows compute_ctl to generate its own request ID
+/// if one isn't supplied. The control plane will always send one as a UUID. The
+/// neon Postgres extension on the other hand does not send one.
+async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
+    let headers = request.headers_mut();
+
+    if headers.get(X_REQUEST_ID).is_none() {
+        headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
     }
+
+    next.run(request).await
 }

 /// Run the HTTP server and wait on it forever.
 #[tokio::main]
 async fn serve(port: u16, compute: Arc<ComputeNode>) {
-    const X_REQUEST_ID: &str = "x-request-id";
-
     let mut app = Router::new()
         .route("/check_writability", post(check_writability::is_writable))
         .route("/configure", post(configure::configure))
@@ -82,9 +73,8 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
         .fallback(handle_404)
         .layer(
             ServiceBuilder::new()
-                .layer(SetRequestIdLayer::x_request_id(
-                    ComputeMakeRequestId::default(),
-                ))
+                // Add this middleware since we assume the request ID exists
+                .layer(middleware::from_fn(maybe_add_request_id_header))
                 .layer(
                     TraceLayer::new_for_http()
                         .on_request(|request: &http::Request<_>, _span: &Span| {
```
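The rewrite above only synthesizes an x-request-id when the caller didn't send one. A self-contained sketch of the same idea on a toy router, assuming axum 0.7 and uuid with the v4 feature (hypothetical route and port, not the compute_ctl server):

```rust
use axum::{
    extract::Request,
    middleware::{self, Next},
    response::Response,
    routing::get,
    Router,
};
use uuid::Uuid;

const X_REQUEST_ID: &str = "x-request-id";

// Same shape as maybe_add_request_id_header in the diff: fill in a UUID only
// when the header is absent, then hand the request to the next layer.
async fn ensure_request_id(mut request: Request, next: Next) -> Response {
    let headers = request.headers_mut();
    if headers.get(X_REQUEST_ID).is_none() {
        headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
    }
    next.run(request).await
}

// Handler that echoes whatever request ID the middleware left on the request.
async fn echo_id(request: Request) -> String {
    request
        .headers()
        .get(X_REQUEST_ID)
        .and_then(|v| v.to_str().ok())
        .unwrap_or("missing")
        .to_string()
}

#[tokio::main]
async fn main() {
    let app: Router = Router::new()
        .route("/", get(echo_id))
        .layer(middleware::from_fn(ensure_request_id));

    let listener = tokio::net::TcpListener::bind("127.0.0.1:3000").await.unwrap();
    axum::serve(listener, app).await.unwrap();
}
```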
```diff
@@ -9,8 +9,9 @@ use clap::{Parser, Subcommand};
 use pageserver_api::{
     controller_api::{
         AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
-        SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
-        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
+        SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
+        ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
+        TenantPolicyRequest,
     },
     models::{
         EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -231,6 +232,13 @@ enum Command {
     },
     /// List safekeepers known to the storage controller
     Safekeepers {},
+    /// Set the scheduling policy of the specified safekeeper
+    SafekeeperScheduling {
+        #[arg(long)]
+        node_id: NodeId,
+        #[arg(long)]
+        scheduling_policy: SkSchedulingPolicyArg,
+    },
 }

 #[derive(Parser)]
@@ -283,6 +291,17 @@ impl FromStr for PlacementPolicyArg {
     }
 }

+#[derive(Debug, Clone)]
+struct SkSchedulingPolicyArg(SkSchedulingPolicy);
+
+impl FromStr for SkSchedulingPolicyArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        SkSchedulingPolicy::from_str(s).map(Self)
+    }
+}
+
 #[derive(Debug, Clone)]
 struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);

@@ -1202,6 +1221,23 @@ async fn main() -> anyhow::Result<()> {
             }
             println!("{table}");
         }
+        Command::SafekeeperScheduling {
+            node_id,
+            scheduling_policy,
+        } => {
+            let scheduling_policy = scheduling_policy.0;
+            storcon_client
+                .dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
+                    Method::POST,
+                    format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
+                    Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
+                )
+                .await?;
+            println!(
+                "Scheduling policy of {node_id} set to {}",
+                String::from(scheduling_policy)
+            );
+        }
     }

     Ok(())
```
```diff
@@ -324,7 +324,7 @@ impl From<NodeSchedulingPolicy> for String {
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum SkSchedulingPolicy {
     Active,
+    Disabled,
     Pause,
     Decomissioned,
 }
@@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy {
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         Ok(match s {
             "active" => Self::Active,
+            "disabled" => Self::Disabled,
             "pause" => Self::Pause,
             "decomissioned" => Self::Decomissioned,
-            _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
+            _ => {
+                return Err(anyhow::anyhow!(
+                    "Unknown scheduling policy '{s}', try active,pause,decomissioned"
+                ))
+            }
         })
     }
 }
@@ -346,7 +350,7 @@ impl From<SkSchedulingPolicy> for String {
         use SkSchedulingPolicy::*;
         match value {
             Active => "active",
+            Disabled => "disabled",
             Pause => "pause",
             Decomissioned => "decomissioned",
         }
         .to_string()
@@ -416,8 +420,6 @@ pub struct MetadataHealthListOutdatedResponse {
 }

 /// Publicly exposed safekeeper description
-///
-/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
 #[derive(Serialize, Deserialize, Clone)]
 pub struct SafekeeperDescribeResponse {
     pub id: NodeId,
@@ -433,6 +435,11 @@ pub struct SafekeeperDescribeResponse {
     pub scheduling_policy: SkSchedulingPolicy,
 }

+#[derive(Serialize, Deserialize, Clone)]
+pub struct SafekeeperSchedulingPolicyRequest {
+    pub scheduling_policy: SkSchedulingPolicy,
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
```
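Since SkSchedulingPolicy carries hand-written FromStr and From<...> for String impls, a natural property to pin down in a test is that every variant round-trips through its string form; a sketch assuming the types above are in scope:

```rust
#[test]
fn sk_scheduling_policy_roundtrip() {
    use std::str::FromStr;

    // Every variant should survive a policy -> String -> parse round trip,
    // which also catches a renamed string in only one of the two impls.
    for policy in [
        SkSchedulingPolicy::Active,
        SkSchedulingPolicy::Disabled,
        SkSchedulingPolicy::Pause,
        SkSchedulingPolicy::Decomissioned,
    ] {
        let s = String::from(policy);
        let parsed = SkSchedulingPolicy::from_str(&s).expect("round trip should parse");
        assert_eq!(parsed, policy);
    }
}
```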
```diff
@@ -24,7 +24,9 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
+#[derive(
+    Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
+)]
 pub struct CompactKey(i128);

 /// The storage key size.
```
```diff
@@ -29,11 +29,10 @@ use utils::{
 };

 use crate::{
-    key::Key,
+    key::{CompactKey, Key},
     reltag::RelTag,
     shard::{ShardCount, ShardStripeSize, TenantShardId},
 };
-use anyhow::bail;
 use bytes::{Buf, BufMut, Bytes, BytesMut};

 /// The state of a tenant in this pageserver.
@@ -1400,6 +1399,8 @@ pub enum PagestreamFeMessage {
     GetPage(PagestreamGetPageRequest),
     DbSize(PagestreamDbSizeRequest),
     GetSlruSegment(PagestreamGetSlruSegmentRequest),
+    #[cfg(feature = "testing")]
+    Test(PagestreamTestRequest),
 }

 // Wrapped in libpq CopyData
@@ -1411,6 +1412,22 @@
     Error(PagestreamErrorResponse),
     DbSize(PagestreamDbSizeResponse),
     GetSlruSegment(PagestreamGetSlruSegmentResponse),
+    #[cfg(feature = "testing")]
+    Test(PagestreamTestResponse),
 }

+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamFeMessageTag {
+    Exists = 0,
+    Nblocks = 1,
+    GetPage = 2,
+    DbSize = 3,
+    GetSlruSegment = 4,
+    /* future tags above this line */
+    /// For testing purposes, not available in production.
+    #[cfg(feature = "testing")]
+    Test = 99,
+}
+
 // Keep in sync with `pagestore_client.h`
@@ -1422,7 +1439,28 @@ enum PagestreamBeMessageTag {
     Error = 103,
     DbSize = 104,
     GetSlruSegment = 105,
+    /* future tags above this line */
+    /// For testing purposes, not available in production.
+    #[cfg(feature = "testing")]
+    Test = 199,
 }

+impl TryFrom<u8> for PagestreamFeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            0 => Ok(PagestreamFeMessageTag::Exists),
+            1 => Ok(PagestreamFeMessageTag::Nblocks),
+            2 => Ok(PagestreamFeMessageTag::GetPage),
+            3 => Ok(PagestreamFeMessageTag::DbSize),
+            4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
+            #[cfg(feature = "testing")]
+            99 => Ok(PagestreamFeMessageTag::Test),
+            _ => Err(value),
+        }
+    }
+}
+
 impl TryFrom<u8> for PagestreamBeMessageTag {
     type Error = u8;
     fn try_from(value: u8) -> Result<Self, u8> {
@@ -1433,6 +1471,8 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
             103 => Ok(PagestreamBeMessageTag::Error),
             104 => Ok(PagestreamBeMessageTag::DbSize),
             105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
+            #[cfg(feature = "testing")]
+            199 => Ok(PagestreamBeMessageTag::Test),
             _ => Err(value),
         }
     }
@@ -1550,6 +1590,20 @@ pub struct PagestreamDbSizeResponse {
     pub db_size: i64,
 }

+#[cfg(feature = "testing")]
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct PagestreamTestRequest {
+    pub hdr: PagestreamRequest,
+    pub batch_key: u64,
+    pub message: String,
+}
+
+#[cfg(feature = "testing")]
+#[derive(Debug)]
+pub struct PagestreamTestResponse {
+    pub req: PagestreamTestRequest,
+}
+
 // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
 // that require pageserver-internal types. It is sufficient to get the total size.
 #[derive(Serialize, Deserialize, Debug)]
@@ -1569,7 +1623,7 @@ impl PagestreamFeMessage {

         match self {
             Self::Exists(req) => {
-                bytes.put_u8(0);
+                bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
                 bytes.put_u64(req.hdr.reqid);
                 bytes.put_u64(req.hdr.request_lsn.0);
                 bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1580,7 +1634,7 @@ impl PagestreamFeMessage {
             }

             Self::Nblocks(req) => {
-                bytes.put_u8(1);
+                bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
                 bytes.put_u64(req.hdr.reqid);
                 bytes.put_u64(req.hdr.request_lsn.0);
                 bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1591,7 +1645,7 @@ impl PagestreamFeMessage {
             }

             Self::GetPage(req) => {
-                bytes.put_u8(2);
+                bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
                 bytes.put_u64(req.hdr.reqid);
                 bytes.put_u64(req.hdr.request_lsn.0);
                 bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1603,7 +1657,7 @@ impl PagestreamFeMessage {
             }

             Self::DbSize(req) => {
-                bytes.put_u8(3);
+                bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
                 bytes.put_u64(req.hdr.reqid);
                 bytes.put_u64(req.hdr.request_lsn.0);
                 bytes.put_u64(req.hdr.not_modified_since.0);
@@ -1611,13 +1665,24 @@ impl PagestreamFeMessage {
             }

             Self::GetSlruSegment(req) => {
-                bytes.put_u8(4);
+                bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
                 bytes.put_u64(req.hdr.reqid);
                 bytes.put_u64(req.hdr.request_lsn.0);
                 bytes.put_u64(req.hdr.not_modified_since.0);
                 bytes.put_u8(req.kind);
                 bytes.put_u32(req.segno);
             }
+            #[cfg(feature = "testing")]
+            Self::Test(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::Test as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.batch_key);
+                let message = req.message.as_bytes();
+                bytes.put_u64(message.len() as u64);
+                bytes.put_slice(message);
+            }
         }

         bytes.into()
@@ -1645,56 +1710,66 @@ impl PagestreamFeMessage {
             ),
         };

-        match msg_tag {
-            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                rel: RelTag {
-                    spcnode: body.read_u32::<BigEndian>()?,
-                    dbnode: body.read_u32::<BigEndian>()?,
-                    relnode: body.read_u32::<BigEndian>()?,
-                    forknum: body.read_u8()?,
-                },
-            })),
-            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                rel: RelTag {
-                    spcnode: body.read_u32::<BigEndian>()?,
-                    dbnode: body.read_u32::<BigEndian>()?,
-                    relnode: body.read_u32::<BigEndian>()?,
-                    forknum: body.read_u8()?,
-                },
-            })),
-            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                rel: RelTag {
-                    spcnode: body.read_u32::<BigEndian>()?,
-                    dbnode: body.read_u32::<BigEndian>()?,
-                    relnode: body.read_u32::<BigEndian>()?,
-                    forknum: body.read_u8()?,
-                },
-                blkno: body.read_u32::<BigEndian>()?,
-            })),
-            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                dbnode: body.read_u32::<BigEndian>()?,
-            })),
-            4 => Ok(PagestreamFeMessage::GetSlruSegment(
+        match PagestreamFeMessageTag::try_from(msg_tag)
+            .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
+        {
+            PagestreamFeMessageTag::Exists => {
+                Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    rel: RelTag {
+                        spcnode: body.read_u32::<BigEndian>()?,
+                        dbnode: body.read_u32::<BigEndian>()?,
+                        relnode: body.read_u32::<BigEndian>()?,
+                        forknum: body.read_u8()?,
+                    },
+                }))
+            }
+            PagestreamFeMessageTag::Nblocks => {
+                Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    rel: RelTag {
+                        spcnode: body.read_u32::<BigEndian>()?,
+                        dbnode: body.read_u32::<BigEndian>()?,
+                        relnode: body.read_u32::<BigEndian>()?,
+                        forknum: body.read_u8()?,
+                    },
+                }))
+            }
+            PagestreamFeMessageTag::GetPage => {
+                Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    rel: RelTag {
+                        spcnode: body.read_u32::<BigEndian>()?,
+                        dbnode: body.read_u32::<BigEndian>()?,
+                        relnode: body.read_u32::<BigEndian>()?,
+                        forknum: body.read_u8()?,
+                    },
+                    blkno: body.read_u32::<BigEndian>()?,
+                }))
+            }
+            PagestreamFeMessageTag::DbSize => {
+                Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    dbnode: body.read_u32::<BigEndian>()?,
+                }))
+            }
+            PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
                 PagestreamGetSlruSegmentRequest {
                     hdr: PagestreamRequest {
                         reqid,
@@ -1705,7 +1780,21 @@ impl PagestreamFeMessage {
                     segno: body.read_u32::<BigEndian>()?,
                 },
             )),
-            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
+            #[cfg(feature = "testing")]
+            PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
+                hdr: PagestreamRequest {
+                    reqid,
+                    request_lsn,
+                    not_modified_since,
+                },
+                batch_key: body.read_u64::<BigEndian>()?,
+                message: {
+                    let len = body.read_u64::<BigEndian>()?;
+                    let mut buf = vec![0; len as usize];
+                    body.read_exact(&mut buf)?;
+                    String::from_utf8(buf)?
+                },
+            })),
         }
     }
 }
@@ -1748,6 +1837,15 @@ impl PagestreamBeMessage {
                         bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
                         bytes.put(&resp.segment[..]);
                     }
+
+                    #[cfg(feature = "testing")]
+                    Self::Test(resp) => {
+                        bytes.put_u8(Tag::Test as u8);
+                        bytes.put_u64(resp.req.batch_key);
+                        let message = resp.req.message.as_bytes();
+                        bytes.put_u64(message.len() as u64);
+                        bytes.put_slice(message);
+                    }
                 }
             }
             PagestreamProtocolVersion::V3 => {
@@ -1816,6 +1914,18 @@ impl PagestreamBeMessage {
                         bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
                         bytes.put(&resp.segment[..]);
                     }
+
+                    #[cfg(feature = "testing")]
+                    Self::Test(resp) => {
+                        bytes.put_u8(Tag::Test as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u64(resp.req.batch_key);
+                        let message = resp.req.message.as_bytes();
+                        bytes.put_u64(message.len() as u64);
+                        bytes.put_slice(message);
+                    }
                 }
             }
         }
@@ -1958,6 +2068,28 @@ impl PagestreamBeMessage {
                     segment: segment.into(),
                 })
             }
+            #[cfg(feature = "testing")]
+            Tag::Test => {
+                let reqid = buf.read_u64::<BigEndian>()?;
+                let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                let batch_key = buf.read_u64::<BigEndian>()?;
+                let len = buf.read_u64::<BigEndian>()?;
+                let mut msg = vec![0; len as usize];
+                buf.read_exact(&mut msg)?;
+                let message = String::from_utf8(msg)?;
+                Self::Test(PagestreamTestResponse {
+                    req: PagestreamTestRequest {
+                        hdr: PagestreamRequest {
+                            reqid,
+                            request_lsn,
+                            not_modified_since,
+                        },
+                        batch_key,
+                        message,
+                    },
+                })
+            }
         };
         let remaining = buf.into_inner();
         if !remaining.is_empty() {
@@ -1977,6 +2109,25 @@ impl PagestreamBeMessage {
             Self::Error(_) => "Error",
             Self::DbSize(_) => "DbSize",
             Self::GetSlruSegment(_) => "GetSlruSegment",
+            #[cfg(feature = "testing")]
+            Self::Test(_) => "Test",
         }
     }
 }

+#[derive(Debug, Serialize, Deserialize)]
+pub struct PageTraceEvent {
+    pub key: CompactKey,
+    pub effective_lsn: Lsn,
+    pub time: SystemTime,
+}
+
+impl Default for PageTraceEvent {
+    fn default() -> Self {
+        Self {
+            key: Default::default(),
+            effective_lsn: Default::default(),
+            time: std::time::UNIX_EPOCH,
+        }
+    }
+}
```
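The change replaces raw magic numbers in the message parser with #[repr(u8)] tag enums plus TryFrom<u8>, so an unknown byte becomes a typed error at one choke point instead of a bail!() deep inside parsing. A standalone illustration of that pattern with hypothetical tags (not the neon types themselves):

```rust
// A wire tag with a fallible u8 -> tag mapping, mirroring the shape of
// PagestreamFeMessageTag in the diff above.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MessageTag {
    Exists = 0,
    GetPage = 2,
    Test = 99,
}

impl TryFrom<u8> for MessageTag {
    type Error = u8;
    fn try_from(value: u8) -> Result<Self, u8> {
        match value {
            0 => Ok(MessageTag::Exists),
            2 => Ok(MessageTag::GetPage),
            99 => Ok(MessageTag::Test),
            other => Err(other),
        }
    }
}

fn main() {
    // Round trip: enum -> byte on the wire -> enum on the other side.
    let on_wire = MessageTag::GetPage as u8;
    assert_eq!(MessageTag::try_from(on_wire), Ok(MessageTag::GetPage));
    // Unknown tags surface the offending byte to the caller.
    assert_eq!(MessageTag::try_from(7), Err(7));
}
```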
```diff
@@ -277,3 +277,8 @@ pub struct TimelineTermBumpResponse {
     pub previous_term: u64,
     pub current_term: u64,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct SafekeeperUtilization {
+    pub timeline_count: u64,
+}
```
libs/utils/src/guard_arc_swap.rs (new file, 54 lines)

```rust
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
//! don't block reads.

use arc_swap::ArcSwap;
use std::sync::Arc;
use tokio::sync::TryLockError;

pub struct GuardArcSwap<T> {
    inner: ArcSwap<T>,
    guard: tokio::sync::Mutex<()>,
}

pub struct Guard<'a, T> {
    _guard: tokio::sync::MutexGuard<'a, ()>,
    inner: &'a ArcSwap<T>,
}

impl<T> GuardArcSwap<T> {
    pub fn new(inner: T) -> Self {
        Self {
            inner: ArcSwap::new(Arc::new(inner)),
            guard: tokio::sync::Mutex::new(()),
        }
    }

    pub fn read(&self) -> Arc<T> {
        self.inner.load_full()
    }

    pub async fn write_guard(&self) -> Guard<'_, T> {
        Guard {
            _guard: self.guard.lock().await,
            inner: &self.inner,
        }
    }

    pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
        let guard = self.guard.try_lock()?;
        Ok(Guard {
            _guard: guard,
            inner: &self.inner,
        })
    }
}

impl<T> Guard<'_, T> {
    pub fn read(&self) -> Arc<T> {
        self.inner.load_full()
    }

    pub fn write(&mut self, value: T) {
        self.inner.store(Arc::new(value));
    }
}
```
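GuardArcSwap is self-contained enough to try in isolation. A hypothetical usage sketch (not part of the diff; assumes a tokio runtime and the `utils::guard_arc_swap` module added above):

```rust
use utils::guard_arc_swap::GuardArcSwap;

#[derive(Debug)]
struct Config {
    limit: usize,
}

#[tokio::main]
async fn main() {
    let cfg = GuardArcSwap::new(Config { limit: 10 });

    // Readers never block: they just load the current Arc snapshot.
    let snapshot = cfg.read();
    println!("limit = {}", snapshot.limit);

    // Writers serialize on the tokio mutex, then swap in a new Arc,
    // so concurrent readers keep seeing a consistent value throughout.
    let mut guard = cfg.write_guard().await;
    let current = guard.read();
    guard.write(Config { limit: current.limit + 1 });
    drop(guard);

    assert_eq!(cfg.read().limit, 11);
}
```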
```diff
@@ -98,6 +98,8 @@ pub mod try_rcu;

 pub mod pprof;

+pub mod guard_arc_swap;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
```
```diff
@@ -8,7 +8,7 @@ license.workspace = true
 default = []
 # Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
+testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]

 [dependencies]
 anyhow.workspace = true
@@ -16,6 +16,7 @@ arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
 bit_field.workspace = true
+bincode.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
@@ -113,3 +114,7 @@ harness = false
 [[bench]]
 name = "upload_queue"
 harness = false
+
+[[bin]]
+name = "test_helper_slow_client_reads"
+required-features = [ "testing" ]
```
```diff
@@ -4,6 +4,9 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

+[features]
+testing = [ "pageserver_api/testing" ]
+
 [dependencies]
 pageserver_api.workspace = true
 thiserror.workspace = true
```
@@ -1,6 +1,9 @@
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use futures::SinkExt;
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
@@ -10,7 +13,6 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -62,15 +64,28 @@ impl Client {
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
|
||||
ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
},
|
||||
)));
|
||||
Ok(PagestreamClient {
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
sink: PagestreamSender {
|
||||
shared: shared.clone(),
|
||||
sink,
|
||||
},
|
||||
stream: PagestreamReceiver {
|
||||
shared: shared.clone(),
|
||||
stream,
|
||||
},
|
||||
shared,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -97,7 +112,28 @@ impl Client {
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: PagestreamSender,
|
||||
stream: PagestreamReceiver,
|
||||
}
|
||||
|
||||
pub struct PagestreamSender {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
|
||||
}
|
||||
|
||||
pub struct PagestreamReceiver {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
|
||||
}
|
||||
|
||||
enum PagestreamShared {
|
||||
ConnTaskRunning(ConnTaskRunning),
|
||||
ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
}
|
||||
struct ConnTaskRunning {
    cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
    conn_task: JoinHandle<()>,
}
@@ -110,11 +146,11 @@ pub struct RelTagBlockNo {
impl PagestreamClient {
    pub async fn shutdown(self) {
        let Self {
            copy_both,
            cancel_on_client_drop: cancel_conn_task,
            conn_task,
        } = self;
        // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
            shared,
            sink,
            stream,
        } = { self };
        // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
        // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
        // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
        //
@@ -131,27 +167,77 @@ impl PagestreamClient {
        //
        // NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
        // => https://github.com/neondatabase/neon/issues/6390
        let _ = cancel_conn_task.unwrap();
        let ConnTaskRunning {
            cancel_on_client_drop,
            conn_task,
        } = {
            let mut guard = shared.lock().unwrap();
            match std::mem::replace(
                &mut *guard,
                PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
            ) {
                PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
                PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
            }
        };
        let _ = cancel_on_client_drop.unwrap();
        conn_task.await.unwrap();
        drop(copy_both);

        // Now drop the split copy_both.
        drop(sink);
        drop(stream);
    }

    pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
        let Self {
            shared: _,
            sink,
            stream,
        } = self;
        (sink, stream)
    }

    pub async fn getpage(
        &mut self,
        req: PagestreamGetPageRequest,
    ) -> anyhow::Result<PagestreamGetPageResponse> {
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
        // let mut req = tokio_util::io::ReaderStream::new(&req);
        let mut req = tokio_stream::once(Ok(req));
        self.getpage_send(req).await?;
        self.getpage_recv().await
    }

        self.copy_both.send_all(&mut req).await?;
    pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
        self.sink.getpage_send(req).await
    }

        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
    pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
        self.stream.getpage_recv().await
    }
}

impl PagestreamSender {
    // TODO: maybe make this impl Sink instead for better composability?
    pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
        let msg = msg.serialize();
        self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
        Ok(())
    }

    pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
        self.send(PagestreamFeMessage::GetPage(req)).await
    }
}

impl PagestreamReceiver {
    // TODO: maybe make this impl Stream instead for better composability?
    pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
        let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
        let next: bytes::Bytes = next.unwrap()?;
        PagestreamBeMessage::deserialize(next)
    }

        let msg = PagestreamBeMessage::deserialize(next)?;
        match msg {
    pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
        let next: PagestreamBeMessage = self.recv().await?;
        match next {
            PagestreamBeMessage::GetPage(p) => Ok(p),
            PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
            PagestreamBeMessage::Exists(_)
@@ -160,7 +246,14 @@ impl PagestreamClient {
            | PagestreamBeMessage::GetSlruSegment(_) => {
                anyhow::bail!(
                    "unexpected be message kind in response to getpage request: {}",
                    msg.kind()
                    next.kind()
                )
            }
            #[cfg(feature = "testing")]
            PagestreamBeMessage::Test(_) => {
                anyhow::bail!(
                    "unexpected be message kind in response to getpage request: {}",
                    next.kind()
                )
            }
        }
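
A minimal usage sketch of the new split API, assuming an already-established pagestream connection; `make_request()` is a hypothetical stand-in for building a `PagestreamGetPageRequest`:

    // Send and receive concurrently after splitting the client.
    let (mut sender, mut receiver) = client.split();
    let send_task = tokio::spawn(async move {
        // `make_request()` is hypothetical; construct a PagestreamGetPageRequest here.
        sender.getpage_send(make_request()).await
    });
    let response = receiver.getpage_recv().await?;
    send_task.await??;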

@@ -8,9 +8,11 @@ license.workspace = true

[dependencies]
anyhow.workspace = true
bincode.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
humantime.workspace = true
itertools.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }

@@ -9,7 +9,9 @@ mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
mod page_trace;

use page_trace::PageTraceCmd;
use std::{
    str::FromStr,
    time::{Duration, SystemTime},
@@ -64,6 +66,7 @@ enum Commands {
    Layer(LayerCmd),
    /// Debug print a hex key found from logs
    Key(key::DescribeKeyCommand),
    PageTrace(PageTraceCmd),
}

/// Read and update pageserver metadata file
@@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
            .await?;
        }
        Commands::Key(dkc) => dkc.execute(),
        Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
    };
    Ok(())
}
73  pageserver/ctl/src/page_trace.rs  Normal file
@@ -0,0 +1,73 @@
use std::collections::HashMap;
use std::io::BufReader;

use camino::Utf8PathBuf;
use clap::Parser;
use itertools::Itertools as _;
use pageserver_api::key::{CompactKey, Key};
use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::RelTag;

/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats.
#[derive(Parser)]
pub(crate) struct PageTraceCmd {
    /// Trace input file.
    path: Utf8PathBuf,
}

pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> {
    let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?);
    let mut events: Vec<PageTraceEvent> = Vec::new();
    loop {
        match bincode::deserialize_from(&mut file) {
            Ok(event) => events.push(event),
            Err(err) => {
                if let bincode::ErrorKind::Io(ref err) = *err {
                    if err.kind() == std::io::ErrorKind::UnexpectedEof {
                        break;
                    }
                }
                return Err(err.into());
            }
        }
    }

    let mut reads_by_relation: HashMap<RelTag, i64> = HashMap::new();
    let mut reads_by_key: HashMap<CompactKey, i64> = HashMap::new();

    for event in events {
        let key = Key::from_compact(event.key);
        let reltag = RelTag {
            spcnode: key.field2,
            dbnode: key.field3,
            relnode: key.field4,
            forknum: key.field5,
        };

        *reads_by_relation.entry(reltag).or_default() += 1;
        *reads_by_key.entry(event.key).or_default() += 1;
    }

    let multi_read_keys = reads_by_key
        .into_iter()
        .filter(|(_, count)| *count > 1)
        .sorted_by_key(|(key, count)| (-*count, *key))
        .collect_vec();

    println!("Multi-read keys: {}", multi_read_keys.len());
    for (key, count) in multi_read_keys {
        println!("  {key}: {count}");
    }

    let reads_by_relation = reads_by_relation
        .into_iter()
        .sorted_by_key(|(rel, count)| (-*count, *rel))
        .collect_vec();

    println!("Reads by relation:");
    for (reltag, count) in reads_by_relation {
        println!("  {reltag}: {count}");
    }

    Ok(())
}
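
As a quick sanity check of the fixed-width bincode framing the reader above relies on, here is a hypothetical round-trip sketch (not part of the diff; field values are placeholder defaults):

    use pageserver_api::models::PageTraceEvent;

    // Write a few default events; the reader's EOF-based loop should consume
    // exactly these and then stop.
    fn write_synthetic_trace(path: &std::path::Path) -> anyhow::Result<()> {
        let mut out = std::io::BufWriter::new(std::fs::File::create(path)?);
        for _ in 0..3 {
            bincode::serialize_into(&mut out, &PageTraceEvent::default())?;
        }
        Ok(())
    }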

65  pageserver/src/bin/test_helper_slow_client_reads.rs  Normal file
@@ -0,0 +1,65 @@
use std::{
    io::{stdin, stdout, Read, Write},
    time::Duration,
};

use clap::Parser;
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
};

#[derive(clap::Parser)]
struct Args {
    connstr: String,
    tenant_id: TenantId,
    timeline_id: TimelineId,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let Args {
        connstr,
        tenant_id,
        timeline_id,
    } = Args::parse();
    let client = pageserver_client::page_service::Client::new(connstr).await?;
    let client = client.pagestream(tenant_id, timeline_id).await?;
    let (mut sender, _receiver) = client.split();

    eprintln!("filling the pipe");
    let mut msg = 0;
    loop {
        msg += 1;
        let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
            PagestreamTestRequest {
                hdr: PagestreamRequest {
                    reqid: 0,
                    request_lsn: Lsn(23),
                    not_modified_since: Lsn(23),
                },
                batch_key: 42,
                message: format!("message {}", msg),
            },
        ));
        let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
            eprintln!("pipe seems full");
            break;
        };
        let _: () = res?;
    }

    let n = stdout().write(b"R")?;
    assert_eq!(n, 1);
    stdout().flush()?;

    eprintln!("waiting for signal to tell us to exit");

    let mut buf = [0u8; 1];
    stdin().read_exact(&mut buf)?;

    eprintln!("termination signal received, exiting");

    anyhow::Ok(())
}
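
A test harness would typically drive this helper over its stdio handshake. A hypothetical driver sketch (not part of the diff; the binary path and arguments are assumptions):

    use std::io::{Read, Write};
    use std::process::{Command, Stdio};

    fn drive_helper(bin: &str, args: &[&str]) -> std::io::Result<()> {
        let mut child = Command::new(bin)
            .args(args)
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .spawn()?;
        // Wait for the single "R" byte that signals the pipe is full.
        let mut ready = [0u8; 1];
        child.stdout.as_mut().unwrap().read_exact(&mut ready)?;
        // ... exercise the slow-client-reads scenario here ...
        // Any byte on stdin tells the helper to exit.
        child.stdin.as_mut().unwrap().write_all(b"X")?;
        child.wait()?;
        Ok(())
    }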

@@ -27,6 +27,7 @@ use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::OffloadedTimelineInfo;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantConfigPatchRequest;
use pageserver_api::models::TenantDetails;
@@ -51,7 +52,9 @@ use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use scopeguard::defer;
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
use tokio::time::Instant;
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -1521,6 +1524,71 @@ async fn timeline_gc_unblocking_handler(
    block_or_unblock_gc(request, false).await
}

/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding.
/// Use the `pagectl page-trace` command to decode and analyze the output.
async fn timeline_page_trace_handler(
    request: Request<Body>,
    cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let state = get_state(&request);
    check_permission(&request, None)?;

    let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024);
    let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5);

    // Convert size limit to event limit based on the serialized size of an event. The event size is
    // fixed, as the default bincode serializer uses fixed-width integer encoding.
    let event_size = bincode::serialize(&PageTraceEvent::default())
        .map_err(|err| ApiError::InternalServerError(err.into()))?
        .len();
    let event_limit = size_limit / event_size;

    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

    // Install a page trace, unless one is already in progress. We just use a buffered channel,
    // which may 2x the memory usage in the worst case, but it's still bounded.
    let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit);
    let cur = timeline.page_trace.load();
    let installed = cur.is_none()
        && timeline
            .page_trace
            .compare_and_swap(cur, Some(Arc::new(trace_tx)))
            .is_none();
    if !installed {
        return Err(ApiError::Conflict("page trace already active".to_string()));
    }
    defer!(timeline.page_trace.store(None)); // uninstall on return

    // Collect the trace and return it to the client. We could stream the response, but this is
    // simple and fine.
    let mut body = Vec::with_capacity(size_limit);
    let deadline = Instant::now() + Duration::from_secs(time_limit_secs);

    while body.len() < size_limit {
        tokio::select! {
            event = trace_rx.recv() => {
                let Some(event) = event else {
                    break; // shouldn't happen (sender doesn't close, unless timeline dropped)
                };
                bincode::serialize_into(&mut body, &event)
                    .map_err(|err| ApiError::InternalServerError(err.into()))?;
            }
            _ = tokio::time::sleep_until(deadline) => break, // time limit reached
            _ = cancel.cancelled() => return Err(ApiError::Cancelled),
        }
    }

    Ok(Response::builder()
        .status(StatusCode::OK)
        .header(header::CONTENT_TYPE, "application/octet-stream")
        .body(hyper::Body::from(body))
        .unwrap())
}
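
The install/uninstall dance above is a compare-and-swap on an `ArcSwapOption`. A minimal standalone sketch of the same pattern, assuming the arc-swap crate (the `String` payload is a placeholder):

    use std::sync::Arc;
    use arc_swap::ArcSwapOption;

    // Succeeds only if the slot was empty both when loaded and when swapped,
    // so two concurrent installers cannot both win.
    fn try_install(slot: &ArcSwapOption<String>, tracer: String) -> bool {
        let cur = slot.load();
        cur.is_none() && slot.compare_and_swap(cur, Some(Arc::new(tracer))).is_none()
    }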

/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
@@ -3479,6 +3547,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
            |r| api_handler(r, timeline_gc_unblocking_handler),
        )
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace",
            |r| api_handler(r, timeline_page_trace_handler),
        )
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })

@@ -100,6 +100,32 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
});

pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_concurrent_initdb",
        "Number of initdb processes running"
    )
    .expect("failed to define a metric")
});

pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_initdb_semaphore_seconds_global",
        "Time spent getting a permit from the global initdb semaphore",
        STORAGE_OP_BUCKETS.into()
    )
    .expect("failed to define metric")
});

pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_initdb_seconds_global",
        "Time spent performing initdb",
        STORAGE_OP_BUCKETS.into()
    )
    .expect("failed to define metric")
});

// Metrics collected on operations on the storage repository.
#[derive(
    Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
@@ -1463,6 +1489,8 @@ pub enum SmgrQueryType {
    GetPageAtLsn,
    GetDbSize,
    GetSlruSegment,
    #[cfg(feature = "testing")]
    Test,
}

pub(crate) struct SmgrQueryTimePerTimeline {
@@ -67,6 +67,7 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::{basebackup, timed_after_cancellation};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -554,37 +555,52 @@ struct BatchedGetPageRequest {
    timer: SmgrOpTimer,
}

#[cfg(feature = "testing")]
struct BatchedTestRequest {
    req: models::PagestreamTestRequest,
    timer: SmgrOpTimer,
}

/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
/// so that we don't keep the [`Timeline::gate`] open while the batch
/// is being built up inside the [`spsc_fold`] (pagestream pipelining).
enum BatchedFeMessage {
    Exists {
        span: Span,
        timer: SmgrOpTimer,
        shard: timeline::handle::Handle<TenantManagerTypes>,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamExistsRequest,
    },
    Nblocks {
        span: Span,
        timer: SmgrOpTimer,
        shard: timeline::handle::Handle<TenantManagerTypes>,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamNblocksRequest,
    },
    GetPage {
        span: Span,
        shard: timeline::handle::Handle<TenantManagerTypes>,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        effective_request_lsn: Lsn,
        pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
    },
    DbSize {
        span: Span,
        timer: SmgrOpTimer,
        shard: timeline::handle::Handle<TenantManagerTypes>,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamDbSizeRequest,
    },
    GetSlruSegment {
        span: Span,
        timer: SmgrOpTimer,
        shard: timeline::handle::Handle<TenantManagerTypes>,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
    #[cfg(feature = "testing")]
    Test {
        span: Span,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        requests: Vec<BatchedTestRequest>,
    },
    RespondError {
        span: Span,
        error: BatchedPageStreamError,
@@ -605,6 +621,12 @@ impl BatchedFeMessage {
                page.timer.observe_execution_start(at);
            }
        }
        #[cfg(feature = "testing")]
        BatchedFeMessage::Test { requests, .. } => {
            for req in requests {
                req.timer.observe_execution_start(at);
            }
        }
        BatchedFeMessage::RespondError { .. } => {}
    }
}
@@ -734,7 +756,7 @@ impl PageServerHandler {
                BatchedFeMessage::Exists {
                    span,
                    timer,
                    shard,
                    shard: shard.downgrade(),
                    req,
                }
            }
@@ -753,7 +775,7 @@ impl PageServerHandler {
                BatchedFeMessage::Nblocks {
                    span,
                    timer,
                    shard,
                    shard: shard.downgrade(),
                    req,
                }
            }
@@ -772,7 +794,7 @@ impl PageServerHandler {
                BatchedFeMessage::DbSize {
                    span,
                    timer,
                    shard,
                    shard: shard.downgrade(),
                    req,
                }
            }
@@ -791,7 +813,7 @@ impl PageServerHandler {
                BatchedFeMessage::GetSlruSegment {
                    span,
                    timer,
                    shard,
                    shard: shard.downgrade(),
                    req,
                }
            }
@@ -843,6 +865,7 @@ impl PageServerHandler {
                )
                .await?;

                // We're holding the Handle
                let effective_request_lsn = match Self::wait_or_get_last_lsn(
                    &shard,
                    req.hdr.request_lsn,
@@ -860,11 +883,27 @@ impl PageServerHandler {
                };
                BatchedFeMessage::GetPage {
                    span,
                    shard,
                    shard: shard.downgrade(),
                    effective_request_lsn,
                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
                }
            }
            #[cfg(feature = "testing")]
            PagestreamFeMessage::Test(req) => {
                let span = tracing::info_span!(parent: parent_span, "handle_test_request");
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
                let timer =
                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
                BatchedFeMessage::Test {
                    span,
                    shard: shard.downgrade(),
                    requests: vec![BatchedTestRequest { req, timer }],
                }
            }
        };
        Ok(Some(batched_msg))
    }
@@ -906,9 +945,7 @@ impl PageServerHandler {
                assert_eq!(accum_pages.len(), max_batch_size.get());
                return false;
            }
            if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
                != (this_shard.tenant_shard_id, this_shard.timeline_id)
            {
            if !accum_shard.is_same_handle_as(&this_shard) {
                trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
                // TODO: we _could_ batch & execute each shard separately (and in parallel).
                // But the current logic for keeping responses in order does not support that.
@@ -927,6 +964,44 @@ impl PageServerHandler {
            accum_pages.extend(this_pages);
            Ok(())
        }
        #[cfg(feature = "testing")]
        (
            Ok(BatchedFeMessage::Test {
                shard: accum_shard,
                requests: accum_requests,
                ..
            }),
            BatchedFeMessage::Test {
                shard: this_shard,
                requests: this_requests,
                ..
            },
        ) if (|| {
            assert!(this_requests.len() == 1);
            if accum_requests.len() >= max_batch_size.get() {
                trace!(%max_batch_size, "stopping batching because of batch size");
                assert_eq!(accum_requests.len(), max_batch_size.get());
                return false;
            }
            if !accum_shard.is_same_handle_as(&this_shard) {
                trace!("stopping batching because timeline object mismatch");
                // TODO: we _could_ batch & execute each shard separately (and in parallel).
                // But the current logic for keeping responses in order does not support that.
                return false;
            }
            let this_batch_key = this_requests[0].req.batch_key;
            let accum_batch_key = accum_requests[0].req.batch_key;
            if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
                trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
                return false;
            }
            true
        })() =>
        {
            // ok to batch
            accum_requests.extend(this_requests);
            Ok(())
        }
        // something batched already but this message is unbatchable
        (_, this_msg) => {
            // by default, don't continue batching
@@ -968,7 +1043,7 @@ impl PageServerHandler {
                fail::fail_point!("ps::handle-pagerequest-message::exists");
                (
                    vec![self
                        .handle_get_rel_exists_request(&shard, &req, ctx)
                        .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
                        .instrument(span.clone())
                        .await
                        .map(|msg| (msg, timer))
@@ -985,7 +1060,7 @@ impl PageServerHandler {
                fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                (
                    vec![self
                        .handle_get_nblocks_request(&shard, &req, ctx)
                        .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
                        .instrument(span.clone())
                        .await
                        .map(|msg| (msg, timer))
@@ -1006,7 +1081,7 @@ impl PageServerHandler {
                trace!(npages, "handling getpage request");
                let res = self
                    .handle_get_page_at_lsn_request_batched(
                        &shard,
                        &*shard.upgrade()?,
                        effective_request_lsn,
                        pages,
                        ctx,
@@ -1028,7 +1103,7 @@ impl PageServerHandler {
                fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                (
                    vec![self
                        .handle_db_size_request(&shard, &req, ctx)
                        .handle_db_size_request(&*shard.upgrade()?, &req, ctx)
                        .instrument(span.clone())
                        .await
                        .map(|msg| (msg, timer))
@@ -1045,7 +1120,7 @@ impl PageServerHandler {
                fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                (
                    vec![self
                        .handle_get_slru_segment_request(&shard, &req, ctx)
                        .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
                        .instrument(span.clone())
                        .await
                        .map(|msg| (msg, timer))
@@ -1053,6 +1128,27 @@ impl PageServerHandler {
                    span,
                )
            }
            #[cfg(feature = "testing")]
            BatchedFeMessage::Test {
                span,
                shard,
                requests,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::test");
                (
                    {
                        let npages = requests.len();
                        trace!(npages, "handling getpage request");
                        let res = self
                            .handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
                            .instrument(span.clone())
                            .await;
                        assert_eq!(res.len(), npages);
                        res
                    },
                    span,
                )
            }
            BatchedFeMessage::RespondError { span, error } => {
                // We've already decided to respond with an error, so we don't need to
                // call the handler.
@@ -1718,6 +1814,20 @@ impl PageServerHandler {
            .query_metrics
            .observe_getpage_batch_start(requests.len());

        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
            let time = SystemTime::now();
            for batch in &requests {
                let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact();
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
                    effective_lsn,
                    time,
                });
            }
        }
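
Note the use of `try_send` above: event submission must never block or fail the GetPage path. A tiny runnable sketch of that semantics, assuming a tokio mpsc channel:

    // A full buffer just drops the event; the sender is never blocked.
    let (tx, _rx) = tokio::sync::mpsc::channel::<u32>(1);
    tx.try_send(1).ok(); // fills the single-slot buffer
    tx.try_send(2).ok(); // buffer full: dropped, the hot path proceeds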

        let results = timeline
            .get_rel_page_at_lsn_batched(
                requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
@@ -1776,6 +1886,51 @@ impl PageServerHandler {
        ))
    }

    // NB: this impl mimics what we do for batched getpage requests.
    #[cfg(feature = "testing")]
    #[instrument(skip_all, fields(shard_id))]
    async fn handle_test_request_batch(
        &mut self,
        timeline: &Timeline,
        requests: Vec<BatchedTestRequest>,
        _ctx: &RequestContext,
    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        // real requests would do something with the timeline
        let mut results = Vec::with_capacity(requests.len());
        for _req in requests.iter() {
            tokio::task::yield_now().await;

            results.push({
                if timeline.cancel.is_cancelled() {
                    Err(PageReconstructError::Cancelled)
                } else {
                    Ok(())
                }
            });
        }

        // TODO: avoid creating the new Vec here
        Vec::from_iter(
            requests
                .into_iter()
                .zip(results.into_iter())
                .map(|(req, res)| {
                    res.map(|()| {
                        (
                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
                                req: req.req.clone(),
                            }),
                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
                        err: PageStreamError::from(e),
                        req: req.req.hdr,
                    })
                }),
        )
    }

    /// Note on "fullbackup":
    /// Full basebackups should only be used for debugging purposes.
    /// Originally, it was introduced to enable breaking storage format changes,
@@ -2391,6 +2546,14 @@ impl From<GetActiveTimelineError> for QueryError {
    }
}

impl From<crate::tenant::timeline::handle::HandleUpgradeError> for QueryError {
    fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self {
        match e {
            crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown,
        }
    }
}
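
Taken together with the `shard.downgrade()` calls earlier, this yields the dispatch pattern: batches hold a `WeakHandle` while queued, and execution upgrades it back, turning a shut-down timeline into `QueryError::Shutdown` via the `From` impl above. A hedged sketch (type paths abbreviated; `handle_request` is hypothetical):

    async fn dispatch(
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
    ) -> Result<(), QueryError> {
        // Fails with HandleUpgradeError::ShutDown -> QueryError::Shutdown
        // if the Timeline shut down while the batch was queued.
        let timeline = shard.upgrade()?;
        handle_request(&*timeline).await; // hypothetical request handler
        Ok(())
    }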

fn set_tracing_field_shard_id(timeline: &Timeline) {
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
    tracing::Span::current().record(

@@ -95,6 +95,9 @@ use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::CONCURRENT_INITDBS;
use crate::metrics::INITDB_RUN_TIME;
use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME;
use crate::metrics::TENANT;
use crate::metrics::{
    remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
@@ -5347,8 +5350,17 @@ async fn run_initdb(
        initdb_bin_path, initdb_target_dir, initdb_lib_dir,
    );

    let _permit = INIT_DB_SEMAPHORE.acquire().await;
    let _permit = {
        let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
        INIT_DB_SEMAPHORE.acquire().await
    };

    CONCURRENT_INITDBS.inc();
    scopeguard::defer! {
        CONCURRENT_INITDBS.dec();
    }

    let _timer = INITDB_RUN_TIME.start_timer();
    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
        superuser: &conf.superuser,
        locale: &conf.locale,
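
The instrumentation above composes three primitives: a histogram for the semaphore wait, a gauge with a scopeguard for live concurrency, and a second histogram for the run itself. A schematic, hedged restatement of the shape (reusing the metrics and semaphore defined in this diff):

    async fn instrumented_initdb_section() {
        let _permit = {
            let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer(); // wait time
            INIT_DB_SEMAPHORE.acquire().await
        };
        CONCURRENT_INITDBS.inc();
        scopeguard::defer! { CONCURRENT_INITDBS.dec(); } // decrement on every exit path
        let _timer = INITDB_RUN_TIME.start_timer(); // run time
        // ... run initdb here ...
    }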

@@ -382,6 +382,12 @@ pub(crate) struct RemoteTimelineClient {
    cancel: CancellationToken,
}

impl Drop for RemoteTimelineClient {
    fn drop(&mut self) {
        debug!("dropping RemoteTimelineClient");
    }
}

impl RemoteTimelineClient {
    ///
    /// Create a remote storage client for given timeline
@@ -797,6 +803,12 @@ impl RemoteTimelineClient {

        upload_queue.dirty.metadata.apply(update);

        // Defense in depth: if we somehow generated invalid metadata, do not persist it.
        upload_queue
            .dirty
            .validate()
            .map_err(|e| anyhow::anyhow!(e))?;

        self.schedule_index_upload(upload_queue);

        Ok(())

@@ -152,6 +152,21 @@ impl IndexPart {
        };
        is_same_remote_layer_path(name, metadata, name, index_metadata)
    }

    /// Check for invariants in the index: this is useful when uploading an index to ensure that if
    /// we encounter a bug, we do not persist buggy metadata.
    pub(crate) fn validate(&self) -> Result<(), String> {
        if self.import_pgdata.is_none()
            && self.metadata.ancestor_timeline().is_none()
            && self.layer_metadata.is_empty()
        {
            // Unless we're in the middle of a raw pgdata import, or this is a child timeline, the
            // index must always have at least one layer.
            return Err("Index has no ancestor and no layers".to_string());
        }

        Ok(())
    }
}
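
A hypothetical unit-test sketch of the invariant above (the constructor and metadata helper are assumptions, not APIs confirmed by this diff):

    #[test]
    fn root_index_without_layers_is_invalid() {
        // Hypothetical: an index with no layers, no ancestor, and no pgdata
        // import in progress must fail validation.
        let index = IndexPart::empty(TimelineMetadata::example());
        assert!(index.validate().is_err());
    }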

/// Metadata gathered for each of the layer files.

@@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part(
    });
    pausable_failpoint!("before-upload-index-pausable");

    // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this
    // (this should never happen)
    index_part.validate().map_err(|e| anyhow::anyhow!(e))?;

    // FIXME: this error comes too late
    let serialized = index_part.to_json_bytes()?;
    let serialized = Bytes::from(serialized);

@@ -1,6 +1,6 @@
use std::time::UNIX_EPOCH;

use pageserver_api::key::CONTROLFILE_KEY;
use pageserver_api::key::{Key, CONTROLFILE_KEY};
use tokio::task::JoinSet;
use utils::{
    completion::{self, Completion},
@@ -9,7 +9,10 @@ use utils::{

use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
use crate::{
    context::DownloadBehavior,
    tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

/// Used in tests to advance a future to wanted await point, and not further.
@@ -31,20 +34,51 @@ async fn smoke_test() {

    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);

    let image_layers = vec![(
        Lsn(0x40),
        vec![(
            Key::from_hex("620000000033333333444444445500000000").unwrap(),
            test_img("foo"),
        )],
    )];

    // Create a test timeline with one real layer, and one synthetic test layer. The synthetic
    // one is only there so that we can GC the real one without leaving the timeline's metadata
    // empty, which is an illegal state (see [`IndexPart::validate`]).
    let timeline = tenant
        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .create_test_timeline_with_layers(
            TimelineId::generate(),
            Lsn(0x10),
            14,
            &ctx,
            Default::default(),
            image_layers,
            Lsn(0x100),
        )
        .await
        .unwrap();

    let layer = {
    // Grab one of the timeline's layers to exercise in the test, and the other layer that is just
    // there to avoid the timeline being illegally empty
    let (layer, dummy_layer) = {
        let mut layers = {
            let layers = timeline.layers.read().await;
            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
        assert_eq!(layers.len(), 2);

        layers.swap_remove(0)
        layers.sort_by_key(|l| l.layer_desc().get_key_range().start);
        let synthetic_layer = layers.pop().unwrap();
        let real_layer = layers.pop().unwrap();
        tracing::info!(
            "real_layer={:?} ({}), synthetic_layer={:?} ({})",
            real_layer,
            real_layer.layer_desc().file_size,
            synthetic_layer,
            synthetic_layer.layer_desc().file_size
        );
        (real_layer, synthetic_layer)
    };

    // all layers created at pageserver are like `layer`, initialized with strong
@@ -173,10 +207,13 @@ async fn smoke_test() {

    let rtc = &timeline.remote_client;

    // Simulate GC removing our test layer.
    {
        let layers = &[layer];
        let mut g = timeline.layers.write().await;

        let layers = &[layer];
        g.open_mut().unwrap().finish_gc_timeline(layers);

        // this just updates the remote_physical_size for demonstration purposes
        rtc.schedule_gc_update(layers).unwrap();
    }
@@ -191,7 +228,10 @@ async fn smoke_test() {

    rtc.wait_completion().await.unwrap();

    assert_eq!(rtc.get_remote_physical_size(), 0);
    assert_eq!(
        rtc.get_remote_physical_size(),
        dummy_layer.metadata().file_size
    );
    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
@@ -14,7 +14,7 @@ pub mod uninit;
mod walreceiver;

use anyhow::{anyhow, bail, ensure, Context, Result};
use arc_swap::ArcSwap;
use arc_swap::{ArcSwap, ArcSwapOption};
use bytes::Bytes;
use camino::Utf8Path;
use chrono::{DateTime, Utc};
@@ -23,6 +23,7 @@ use fail::fail_point;
use handle::ShardTimelineId;
use offload::OffloadError;
use once_cell::sync::Lazy;
use pageserver_api::models::PageTraceEvent;
use pageserver_api::{
    config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
    key::{
@@ -42,6 +43,7 @@ use rand::Rng;
use remote_storage::DownloadError;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::sync::mpsc::Sender;
use tokio::{
    runtime::Handle,
    sync::{oneshot, watch},
@@ -49,7 +51,9 @@ use tokio::{
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{
    fs_ext, pausable_failpoint,
    fs_ext,
    guard_arc_swap::GuardArcSwap,
    pausable_failpoint,
    postgres_client::PostgresClientProtocol,
    sync::gate::{Gate, GateGuard},
};
@@ -72,6 +76,7 @@ use std::{pin::pin, sync::OnceLock};

use crate::{
    aux_file::AuxFileSizeEstimator,
    page_service::TenantManagerTypes,
    tenant::{
        config::AttachmentMode,
        layer_map::{LayerMap, SearchResult},
@@ -351,8 +356,8 @@ pub struct Timeline {
    // though let's keep them both for better error visibility.
    pub initdb_lsn: Lsn,

    /// When did we last calculate the partitioning? Made pub for test cases.
    pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
    /// The repartitioning result. Allows a single writer and multiple readers.
    pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,

    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
@@ -427,12 +432,15 @@ pub struct Timeline {

    pub(crate) l0_flush_global_state: L0FlushGlobalState,

    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
    pub(crate) handles: handle::PerTimelineState<TenantManagerTypes>,

    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,

    /// Cf. [`crate::tenant::CreateTimelineIdempotency`].
    pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,

    /// If Some, collects GetPage metadata for an ongoing PageTrace.
    pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
}

pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -2335,7 +2343,8 @@ impl Timeline {
                // initial logical size is 0.
                LogicalSize::empty_initial()
            },
            partitioning: tokio::sync::Mutex::new((

            partitioning: GuardArcSwap::new((
                (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
                Lsn(0),
            )),
@@ -2380,6 +2389,8 @@ impl Timeline {
            attach_wal_lag_cooldown,

            create_idempotency,

            page_trace: Default::default(),
        };

        result.repartition_threshold =
@@ -4021,18 +4032,15 @@ impl Timeline {
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
        let Ok(mut guard) = self.partitioning.try_write_guard() else {
            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
            // The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
            // and hence before the compaction task starts.
            // Note that there is a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for
            // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own
            // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction.
            return Err(CompactionError::Other(anyhow!(
                "repartition() called concurrently, this is rare and a retry should be fine"
                "repartition() called concurrently"
            )));
        };
        let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
        let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
        if lsn < *partition_lsn {
            return Err(CompactionError::Other(anyhow!(
                "repartition() called with LSN going backwards, this should not happen"
@@ -4060,9 +4068,9 @@ impl Timeline {
        let sparse_partitioning = SparseKeyPartitioning {
            parts: vec![sparse_ks],
        }; // no partitioning for metadata keys for now
        *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);

        Ok((partitioning_guard.0.clone(), partitioning_guard.1))
        let result = ((dense_partitioning, sparse_partitioning), lsn);
        guard.write(result.clone());
        Ok(result)
    }
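
The `GuardArcSwap` switch above gives repartitioning a single-writer, many-reader discipline. A hedged sketch of the usage pattern, with the API exactly as exercised in this diff (`Parts` is a hypothetical alias for the partitioning tuple):

    type Parts = ((KeyPartitioning, SparseKeyPartitioning), Lsn);

    fn publish(partitioning: &GuardArcSwap<Parts>, next: Parts) -> anyhow::Result<()> {
        // Writers must win the single write guard; losing means another
        // writer is active and the caller should retry later.
        let Ok(mut guard) = partitioning.try_write_guard() else {
            anyhow::bail!("repartition() called concurrently");
        };
        guard.write(next);
        Ok(())
        // Readers elsewhere just call `partitioning.read()` without locking.
    }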

    // Is it time to create a new image layer for the given partition?
@@ -4618,6 +4626,10 @@ impl Drop for Timeline {
                }
            }
        }
        info!(
            "Timeline {} for tenant {} is being dropped",
            self.timeline_id, self.tenant_shard_id.tenant_id
        );
    }
}

@@ -5666,9 +5678,17 @@ impl Timeline {
        info!("force created image layer {}", image_layer.local_path());
        {
            let mut guard = self.layers.write().await;
            guard.open_mut().unwrap().force_insert_layer(image_layer);
            guard
                .open_mut()
                .unwrap()
                .force_insert_layer(image_layer.clone());
        }

        // Update remote_timeline_client state to reflect existence of this layer
        self.remote_client
            .schedule_layer_file_upload(image_layer)
            .unwrap();

        Ok(())
    }

@@ -5719,9 +5739,17 @@ impl Timeline {
        info!("force created delta layer {}", delta_layer.local_path());
        {
            let mut guard = self.layers.write().await;
            guard.open_mut().unwrap().force_insert_layer(delta_layer);
            guard
                .open_mut()
                .unwrap()
                .force_insert_layer(delta_layer.clone());
        }

        // Update remote_timeline_client state to reflect existence of this layer
        self.remote_client
            .schedule_layer_file_upload(delta_layer)
            .unwrap();

        Ok(())
    }

@@ -1776,7 +1776,10 @@ impl Timeline {
        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
    ) -> anyhow::Result<KeyHistoryRetention> {
        // Pre-checks for the invariants
        if cfg!(debug_assertions) {

        let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");

        if debug_mode {
            for (log_key, _, _) in full_history {
                assert_eq!(log_key, &key, "mismatched key");
            }
@@ -1922,15 +1925,19 @@ impl Timeline {
            output
        }

        let mut key_exists = false;
        for (i, split_for_lsn) in split_history.into_iter().enumerate() {
            // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
            records_since_last_image += split_for_lsn.len();
            let generate_image = if i == 0 && !has_ancestor {
            // Whether to produce an image into the final layer files
            let produce_image = if i == 0 && !has_ancestor {
                // We always generate images for the first batch (below horizon / lowest retain_lsn)
                true
            } else if i == batch_cnt - 1 {
                // Do not generate images for the last batch (above horizon)
                false
            } else if records_since_last_image == 0 {
                false
            } else if records_since_last_image >= delta_threshold_cnt {
                // Generate images when there are too many records
                true
@@ -1945,29 +1952,45 @@ impl Timeline {
                    break;
                }
            }
            if let Some((_, _, val)) = replay_history.first() {
                if !val.will_init() {
                    return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
                        || {
                            generate_debug_trace(
                                Some(&replay_history),
                                full_history,
                                retain_lsn_below_horizon,
                                horizon,
                            )
                        },
                    );
                }
            if replay_history.is_empty() && !key_exists {
                // The key does not exist at earlier LSN, we can skip this iteration.
                retention.push(Vec::new());
                continue;
            } else {
                key_exists = true;
            }
            if generate_image && records_since_last_image > 0 {
            let Some((_, _, val)) = replay_history.first() else {
                unreachable!("replay history should not be empty once it exists")
            };
            if !val.will_init() {
                return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| {
                    generate_debug_trace(
                        Some(&replay_history),
                        full_history,
                        retain_lsn_below_horizon,
                        horizon,
                    )
                });
            }
            // Whether to reconstruct the image. In debug mode, we will generate an image
            // at every retain_lsn to ensure data is not corrupted, but we won't put the
            // image into the final layer.
            let generate_image = produce_image || debug_mode;
            if produce_image {
                records_since_last_image = 0;
                let replay_history_for_debug = if cfg!(debug_assertions) {
            }
            let img_and_lsn = if generate_image {
                let replay_history_for_debug = if debug_mode {
                    Some(replay_history.clone())
                } else {
                    None
                };
                let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
                let history = std::mem::take(&mut replay_history);
                let history = if produce_image {
                    std::mem::take(&mut replay_history)
                } else {
                    replay_history.clone()
                };
                let mut img = None;
                let mut records = Vec::with_capacity(history.len());
                if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2004,8 +2027,20 @@ impl Timeline {
                }
                records.reverse();
                let state = ValueReconstructState { img, records };
                let request_lsn = lsn_split_points[i]; // last batch does not generate an image so i is always in range
                // last batch does not generate an image so i is always in range, unless we
                // force-generate an image during testing
                let request_lsn = if i >= lsn_split_points.len() {
                    Lsn::MAX
                } else {
                    lsn_split_points[i]
                };
                let img = self.reconstruct_value(key, request_lsn, state).await?;
                Some((request_lsn, img))
            } else {
                None
            };
            if produce_image {
                let (request_lsn, img) = img_and_lsn.unwrap();
                replay_history.push((key, request_lsn, Value::Image(img.clone())));
                retention.push(vec![(request_lsn, Value::Image(img))]);
            } else {
@@ -2111,12 +2146,7 @@ impl Timeline {
        let mut compact_jobs = Vec::new();
        // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
        // by estimating the amount of files read for a compaction job. We should also partition on LSN.
        let ((dense_ks, sparse_ks), _) = {
            let Ok(partition) = self.partitioning.try_lock() else {
                bail!("failed to acquire partition lock during gc-compaction");
            };
            partition.clone()
        };
        let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
        // Truncate the key range to be within user specified compaction range.
        fn truncate_to(
            source_start: &Key,
@@ -2273,6 +2303,8 @@ impl Timeline {
        let compact_key_range = job.compact_key_range;
        let compact_lsn_range = job.compact_lsn_range;

        let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");

        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);

        scopeguard::defer! {
@@ -2398,7 +2430,7 @@ impl Timeline {
            .first()
            .copied()
            .unwrap_or(job_desc.gc_cutoff);
        if cfg!(debug_assertions) {
        if debug_mode {
            assert_eq!(
                res,
                job_desc
@@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory(
}

/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
/// For more context see comments in [`make_timeline_delete_guard`]
async fn remove_maybe_offloaded_timeline_from_tenant(
    tenant: &Tenant,
    timeline: &TimelineOrOffloaded,
@@ -193,10 +193,8 @@ impl DeleteTimelineFlow {
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();

        let allow_offloaded_children = false;
        let set_stopping = true;
        let (timeline, mut guard) =
            Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;
            make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?;

        guard.mark_in_progress()?;

@@ -333,75 +331,6 @@ impl DeleteTimelineFlow {
        Ok(())
    }

    pub(super) fn prepare(
        tenant: &Tenant,
        timeline_id: TimelineId,
        allow_offloaded_children: bool,
        set_stopping: bool,
    ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
        // Note the interaction between this guard and deletion guard.
        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
        // This is important because when you take into account `remove_timeline_from_tenant`
        // we remove timeline from memory when we still hold the deletion guard.
        // So here when timeline deletion is finished timeline won't be present in timelines map at all
        // which makes the following sequence impossible:
        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
        let timelines = tenant.timelines.lock().unwrap();
        let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();

        let timeline = match timelines.get(&timeline_id) {
            Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
            None => match timelines_offloaded.get(&timeline_id) {
                Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
                None => return Err(DeleteTimelineError::NotFound),
            },
        };

        // Ensure that there are no child timelines, because we are about to remove files,
        // which will break child branches
        let mut children = Vec::new();
        if !allow_offloaded_children {
            children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
                (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
            }));
        }
        children.extend(timelines.iter().filter_map(|(id, entry)| {
            (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
        }));

        if !children.is_empty() {
            return Err(DeleteTimelineError::HasChildren(children));
        }

        // Note that using try_lock here is important to avoid a deadlock.
        // Here we take lock on timelines and then the deletion guard.
        // At the end of the operation we're holding the guard and need to lock timelines map
        // to remove the timeline from it.
        // Whenever two locks are taken in inconsistent order, this can result in a deadlock.

        let delete_progress = Arc::clone(timeline.delete_progress());
        let delete_lock_guard = match delete_progress.try_lock_owned() {
            Ok(guard) => DeletionGuard(guard),
            Err(_) => {
                // Unfortunately if lock fails arc is consumed.
                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
                    timeline.delete_progress(),
                )));
            }
        };

        if set_stopping {
            if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
                timeline.set_state(TimelineState::Stopping);
            }
        }

        Ok((timeline, delete_lock_guard))
    }

    fn schedule_background(
        guard: DeletionGuard,
        conf: &'static PageServerConf,
@@ -483,6 +412,80 @@ impl DeleteTimelineFlow {
    }
}

#[derive(Copy, Clone, PartialEq, Eq)]
pub(super) enum TimelineDeleteGuardKind {
    Offload,
    Delete,
}

pub(super) fn make_timeline_delete_guard(
    tenant: &Tenant,
    timeline_id: TimelineId,
    guard_kind: TimelineDeleteGuardKind,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
    // Note the interaction between this guard and deletion guard.
    // Here we attempt to lock deletion guard when we're holding a lock on timelines.
    // This is important because when you take into account `remove_timeline_from_tenant`
    // we remove timeline from memory when we still hold the deletion guard.
    // So here when timeline deletion is finished timeline won't be present in timelines map at all
    // which makes the following sequence impossible:
    // T1: get preempted right before the try_lock on `Timeline::delete_progress`
    // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
    // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
    // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
    let timelines = tenant.timelines.lock().unwrap();
    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();

    let timeline = match timelines.get(&timeline_id) {
        Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
        None => match timelines_offloaded.get(&timeline_id) {
            Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
            None => return Err(DeleteTimelineError::NotFound),
        },
    };

    // Ensure that there are no child timelines, because we are about to remove files,
    // which will break child branches
    let mut children = Vec::new();
    if guard_kind == TimelineDeleteGuardKind::Delete {
        children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
            (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
        }));
    }
    children.extend(timelines.iter().filter_map(|(id, entry)| {
        (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
    }));

    if !children.is_empty() {
        return Err(DeleteTimelineError::HasChildren(children));
    }

    // Note that using try_lock here is important to avoid a deadlock.
    // Here we take lock on timelines and then the deletion guard.
    // At the end of the operation we're holding the guard and need to lock timelines map
    // to remove the timeline from it.
    // Whenever two locks are taken in inconsistent order, this can result in a deadlock.

    let delete_progress = Arc::clone(timeline.delete_progress());
    let delete_lock_guard = match delete_progress.try_lock_owned() {
        Ok(guard) => DeletionGuard(guard),
        Err(_) => {
            // Unfortunately if lock fails arc is consumed.
            return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
                timeline.delete_progress(),
            )));
        }
    };

    if guard_kind == TimelineDeleteGuardKind::Delete {
        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
            timeline.set_state(TimelineState::Stopping);
        }
    }

    Ok((timeline, delete_lock_guard))
}
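
A brief usage sketch of the refactored entry point, as exercised by the deletion path above (an offload caller would pass `TimelineDeleteGuardKind::Offload`, which permits offloaded children and skips the `Stopping` transition):

    let (timeline, mut guard) =
        make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?;
    guard.mark_in_progress()?;
    // ... proceed with deletion while holding `guard` ...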
|
||||
|
||||
pub(super) struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
|
||||
|
||||
impl Deref for DeletionGuard {
|
||||
|
||||
@@ -32,54 +32,151 @@
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! ## Data Structures
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
|
||||
//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
|
||||
//!
|
||||
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
|
||||
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
|
||||
//!
|
||||
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||
//! The `HandleInner` is allocated as a `Arc<Mutex<HandleInner>>` and
|
||||
//! referenced weakly and strongly from various places which we are now illustrating.
|
||||
//! For brevity, we will omit the `Arc<Mutex<>>` part in the following and instead
|
||||
//! use `strong ref` and `weak ref` when referring to the `Arc<Mutex<HandleInner>>`
|
||||
//! or `Weak<Mutex<HandleInner>>`, respectively.
|
||||
//!
|
||||
//! - The `Handle` is a strong ref.
|
||||
//! - The `WeakHandle` is a weak ref.
|
||||
//! - The `PerTimelineState` contains a `HashMap<CacheId, strong ref>`.
|
||||
//! - The `Cache` is a `HashMap<unique identifier for the shard, weak ref>`.
|
||||
//!
|
||||
//! Lifetimes:
|
||||
//! - `WeakHandle` and `Handle`: single pagestream request.
|
||||
//! - `Cache`: single page service connection.
|
||||
//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`).
|
||||
//!
|
||||
//! ## Request Handling Flow (= filling and using the `Cache``)
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
|
||||
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
|
||||
//! and a strong ref in the `PerTimelineState`.
|
||||
//! A strong ref is returned wrapped in a `Handle`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the `Weak<HandleInner>` in the cache.
|
||||
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||
//! and find the weak ref in the cache.
|
||||
//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`.
|
||||
//!
|
||||
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! The pagestream processing is pipelined and involves a batching step.
|
||||
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
|
||||
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
|
||||
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//!
|
||||
//! # Performance
//!
//! Remember from the introductory section:
//!
//! > However, we want to avoid the overhead of entering the gate for every
//! > method invocation.
//!
//! Why do we want to avoid that?
//! Because the gate is a shared location in memory and entering it involves
//! bumping refcounts, which leads to cache contention if done frequently
//! from multiple cores in parallel.
//!
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
//! That `Arc` is private to the `HandleInner` and hence to the connection.
//! (Review the "Data Structures" section if that is unclear to you.)
//!
//! A `WeakHandle` is a weak ref to the `HandleInner`.
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
//! Again, this manipulation of ref counts is cheap because the `Arc` is private to the connection.
//!
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
//! Again, this is cheap because the `Arc` is private to the connection.
//!
//! In addition to the GateGuard, we need to provide a `Deref<Target=Timeline>` impl.
//! For this, the `Handle` needs infallible access to an `Arc<Timeline>`.
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
//! on the shared memory location that tracks the refcount of the `Arc<Timeline>`.
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
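//!
//! A minimal sketch of why the extra `Arc` layer avoids contention (the
//! bindings and the `tenant_manager_resolve` helper are illustrative):
//!
//! ```rust,ignore
//! // Shared across all connections: cloning this contends on one refcount.
//! let shared: Arc<Timeline> = tenant_manager_resolve();
//!
//! // Wrapped once per (Connection,Timeline) on cache miss:
//! let private: Arc<Arc<Timeline>> = Arc::new(Arc::clone(&shared));
//!
//! // Per-request upgrades clone only the outer Arc; its refcount is touched
//! // by this connection alone, so other cores never contend on it.
//! let per_request: Arc<Arc<Timeline>> = Arc::clone(&private);
//! ```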
//!
//! # Shutdown
//!
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
//! ```
//!
//! Further, there is this cycle:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
//! ```
//!
//! The former cycle is a memory leak if not broken.
//! The latter cycle further prevents the Timeline from shutting down
//! because we certainly won't drop the Timeline while the GateGuard is alive.
//! Preventing shutdown is the whole point of this handle/cache system,
//! but when the Timeline needs to shut down, we need to break the cycle.
//!
//! The cycle is broken by either
//! - Timeline shutdown (=> `PerTimelineState::shutdown`) or
//! - Connection shutdown (=> dropping the `Cache`).
//!
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
//! `Arc<GateGuard>`.
//!
//! `PerTimelineState::shutdown` drops all the `HandleInner`s it contains,
//! thereby breaking the cycle.
//! It also initiates draining of already existing `Handle`s by
//! poisoning things so that no new `HandleInner`s can be added
//! to the `PerTimelineState`, which will make subsequent `Cache::get` calls fail.
//!
//! Concurrently existing / already upgraded `Handle`s will extend the
//! lifetime of the `Arc<Mutex<HandleInner>>` and hence the cycles.
//! However, since `Handle`s are short-lived and new `Handle`s are not
//! handed out from `Cache::get` or `WeakHandle::upgrade` after
//! `PerTimelineState::shutdown`, that extension of the cycle is bounded.
//!
//! Concurrently existing `WeakHandle`s will fail to `upgrade()`:
//! while they will succeed in upgrading the `Weak<Mutex<HandleInner>>`,
//! they will find the inner in the `HandleInner::ShutDown` state, where the
//! `Arc<GateGuard>` and Timeline have already been dropped.
//!
//! Dropping the `Cache` undoes the registration of this `Cache`'s
//! `HandleInner`s from all the `PerTimelineState`s, i.e., it
//! removes the strong ref to each of its `HandleInner`s
//! from all the `PerTimelineState`s.
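//!
//! A short sketch of the two shutdown paths described above (the `timeline`
//! and `cache` bindings are illustrative):
//!
//! ```rust,ignore
//! // Timeline shutdown: poisons PerTimelineState and drops its strong refs.
//! timeline.per_timeline_state().shutdown();
//! // From here on, Cache::get and WeakHandle::upgrade fail for this timeline;
//! // the gate can close once the last concurrently-held Handle is dropped.
//!
//! // Connection shutdown: dropping the Cache deregisters all of its
//! // HandleInners from every PerTimelineState.
//! drop(cache);
//! ```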
//!
//! # Locking Rules
//!
//! To prevent deadlocks we:
//!
//! 1. Only ever hold one of the locks at a time.
//! 2. Don't add more than one Drop impl that locks on the
//!    cycles above.
//!
//! As per (2), that impl is in `Drop for Cache`.
//!
//! # Fast Path for Shard Routing
//!
//! The `Cache` has a fast path for shard routing to avoid calling into
//! the tenant manager for every request.
//!
//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s.
//!
//! The current implementation uses the first entry in the hash map
//! to determine the `ShardParameters` and derive the correct
@@ -87,18 +184,18 @@
//!
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
//!
//! If the lookup is successful and the `WeakHandle` can be upgraded,
//! it's a hit.
//!
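//! A sketch of the fast path (the map access is simplified; `get_shard_number`
//! is assumed to be the shard-for-key helper on the shard identity type):
//!
//! ```rust,ignore
//! // Derive the shard for the requested key from any cached entry's identity
//! // (all shards of a tenant share the same ShardParameters) ...
//! let shard_index = make_shard_index(identity.get_shard_number(&key));
//! // ... then do a cheap local lookup instead of asking the tenant manager.
//! let id = ShardTimelineId { shard_index, timeline_id };
//! if let Some(weak) = map.get(&id) {
//!     if let Ok(handle) = weak.upgrade() {
//!         return Ok(handle); // hit
//!     }
//! }
//! ```
//!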
//! ## Cache invalidation
//!
//! The insight is that cache invalidation is sufficient and most efficient if done lazily.
//! The only reasons why an entry in the cache can become stale are:
//! 1. The `PerTimelineState` / Timeline is shutting down, e.g. because the shard is
//!    being detached, the timeline or shard is being deleted, or the pageserver is shutting down.
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
//!
//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the
//! timeline has shut down, and when that happens, we remove the entry from the cache.
//!
//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
@@ -107,8 +204,6 @@

use std::collections::hash_map;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
@@ -152,7 +247,7 @@ pub(crate) struct Cache<T: Types> {
    map: Map<T>,
}

type Map<T> = HashMap<ShardTimelineId, WeakHandle<T>>;

impl<T: Types> Default for Cache<T> {
    fn default() -> Self {
@@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId {
}

/// See module-level comment.
pub(crate) struct Handle<T: Types> {
    timeline: Arc<T::Timeline>,
    #[allow(dead_code)] // the field exists to keep the gate open
    gate_guard: Arc<utils::sync::gate::GateGuard>,
    inner: Arc<Mutex<HandleInner<T>>>,
}
pub(crate) struct WeakHandle<T: Types> {
    inner: Weak<Mutex<HandleInner<T>>>,
}
enum HandleInner<T: Types> {
    KeepingTimelineGateOpen {
        #[allow(dead_code)]
        gate_guard: Arc<utils::sync::gate::GateGuard>,
        timeline: Arc<T::Timeline>,
    },
    ShutDown,
}

/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
@@ -183,7 +288,8 @@ struct HandleInner<T: Types> {
/// See module-level comment for details.
pub struct PerTimelineState<T: Types> {
    // None = shutting down
    #[allow(clippy::type_complexity)]
    handles: Mutex<Option<HashMap<CacheId, Arc<Mutex<HandleInner<T>>>>>>,
}

impl<T: Types> Default for PerTimelineState<T> {
@@ -243,49 +349,24 @@ impl<T: Types> Cache<T> {
        shard_selector: ShardSelector,
        tenant_manager: &T::TenantManager,
    ) -> Result<Handle<T>, GetError<T>> {
        // terminates because every iteration removes an element from the map
        let miss: ShardSelector = loop {
            let routing_state = self.shard_routing(timeline_id, shard_selector);
            match routing_state {
                RoutingResult::FastPath(handle) => return Ok(handle),
                RoutingResult::SlowPath(key) => match self.map.get(&key) {
                    Some(cached) => match cached.upgrade() {
                        Ok(upgraded) => return Ok(upgraded),
                        Err(HandleUpgradeError::ShutDown) => {
                            // TODO: dedup with shard_routing()
                            trace!("handle cache stale");
                            self.map.remove(&key).unwrap();
                            continue;
                        }
                    },
                    None => break ShardSelector::Known(key.shard_index),
                },
                RoutingResult::NeedConsultTenantManager => break shard_selector,
            }
        };
        self.get_miss(timeline_id, miss, tenant_manager).await
@@ -302,7 +383,7 @@ impl<T: Types> Cache<T> {
        let Some((first_key, first_handle)) = self.map.iter().next() else {
            return RoutingResult::NeedConsultTenantManager;
        };
        let Ok(first_handle) = first_handle.upgrade() else {
            // TODO: dedup with get()
            trace!("handle cache stale");
            let first_key_owned = *first_key;
@@ -310,7 +391,7 @@ impl<T: Types> Cache<T> {
            continue;
        };

        let first_handle_shard_identity = first_handle.get_shard_identity();
        let make_shard_index = |shard_num: ShardNumber| ShardIndex {
            shard_number: shard_num,
            shard_count: first_handle_shard_identity.count,
@@ -329,11 +410,11 @@ impl<T: Types> Cache<T> {
        };
        let first_handle_shard_timeline_id = ShardTimelineId {
            shard_index: first_handle_shard_identity.shard_index(),
            timeline_id: first_handle.shard_timeline_id().timeline_id,
        };

        if need_shard_timeline_id == first_handle_shard_timeline_id {
            return RoutingResult::FastPath(first_handle);
        } else {
            return RoutingResult::SlowPath(need_shard_timeline_id);
        }
@@ -357,23 +438,30 @@ impl<T: Types> Cache<T> {
            ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
        }

        trace!("creating new HandleInner");
        let handle = {
            // TODO: global metric that keeps track of the number of live HandlerTimeline instances
            // so we can identify reference cycle bugs.
            let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
                gate_guard: Arc::new(
                    // this enter() is expensive in production code because
                    // it hits the global Arc<Timeline>::gate refcounts
                    match timeline.gate().enter() {
                        Ok(guard) => guard,
                        Err(_) => {
                            return Err(GetError::TimelineGateClosed);
                        }
                    },
                ),
                // this clone is expensive in production code because
                // it hits the global Arc<Timeline>::clone refcounts
                timeline: Arc::new(timeline.clone()),
            }));
            let handle_weak = WeakHandle {
                inner: Arc::downgrade(&handle_inner_arc),
            };
            let handle = handle_weak
                .upgrade()
                .ok()
                .expect("we just created it and it's not linked anywhere yet");
            {
                let mut lock_guard = timeline
                    .per_timeline_state()
                    .handles
@@ -381,7 +469,8 @@
                    .expect("mutex poisoned");
                match &mut *lock_guard {
                    Some(per_timeline_state) => {
                        let replaced =
                            per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
                        assert!(replaced.is_none(), "some earlier code left a stale handle");
                        match self.map.entry(key) {
                            hash_map::Entry::Occupied(_o) => {
@@ -392,8 +481,7 @@
                                unreachable!()
                            }
                            hash_map::Entry::Vacant(v) => {
                                v.insert(handle_weak);
                            }
                        }
                    }
@@ -401,14 +489,62 @@
                        return Err(GetError::PerTimelineStateShutDown);
                    }
                }
            };
            Ok(handle)
        }
        Err(e) => Err(GetError::TenantManager(e)),
    }
}
}
pub(crate) enum HandleUpgradeError {
    ShutDown,
}

impl<T: Types> WeakHandle<T> {
    pub(crate) fn upgrade(&self) -> Result<Handle<T>, HandleUpgradeError> {
        let Some(inner) = Weak::upgrade(&self.inner) else {
            return Err(HandleUpgradeError::ShutDown);
        };
        let lock_guard = inner.lock().expect("poisoned");
        match &*lock_guard {
            HandleInner::KeepingTimelineGateOpen {
                timeline,
                gate_guard,
            } => {
                let gate_guard = Arc::clone(gate_guard);
                let timeline = Arc::clone(timeline);
                drop(lock_guard);
                Ok(Handle {
                    timeline,
                    gate_guard,
                    inner,
                })
            }
            HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
        }
    }

    pub(crate) fn is_same_handle_as(&self, other: &WeakHandle<T>) -> bool {
        Weak::ptr_eq(&self.inner, &other.inner)
    }
}

impl<T: Types> std::ops::Deref for Handle<T> {
    type Target = T::Timeline;
    fn deref(&self) -> &Self::Target {
        &self.timeline
    }
}

impl<T: Types> Handle<T> {
    pub(crate) fn downgrade(&self) -> WeakHandle<T> {
        WeakHandle {
            inner: Arc::downgrade(&self.inner),
        }
    }
}
impl<T: Types> PerTimelineState<T> {
    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
    /// to the [`Types::Timeline`] that embeds this per-timeline state.
@@ -430,43 +566,54 @@ impl<T: Types> PerTimelineState<T> {
            trace!("already shut down");
            return;
        };
        for handle_inner_arc in handles.values() {
            // Make hits fail.
            let mut lock_guard = handle_inner_arc.lock().expect("poisoned");
            lock_guard.shutdown();
        }
        drop(handles);
    }
}
#[cfg(test)]
impl<T: Types> Drop for HandleInner<T> {
    fn drop(&mut self) {
        trace!("HandleInner dropped");
    }
}
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
    fn drop(&mut self) {
        for (
            _,
            WeakHandle {
                inner: handle_inner_weak,
            },
        ) in self.map.drain()
        {
            let Some(handle_inner_arc) = handle_inner_weak.upgrade() else {
                continue;
            };
            let handle_timeline = handle_inner_arc
                // locking rules: drop lock before acquiring other lock below
                .lock()
                .expect("poisoned")
                .shutdown();
            let per_timeline_state = handle_timeline.per_timeline_state();
            let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned");
            let Some(handles) = &mut *handles_lock_guard else {
                continue;
            };
            let Some(removed_handle_inner_arc) = handles.remove(&self.id) else {
                // There could have been a shutdown in between us upgrading the weak and locking the mutex.
                continue;
            };
            drop(handles_lock_guard); // locking rules: drop the lock before the assert below
            assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc));
        }
    }
}
impl<T: Types> HandleInner<T> {
    fn shutdown(&mut self) -> Arc<T::Timeline> {
        match std::mem::replace(self, HandleInner::ShutDown) {
            HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline,
            HandleInner::ShutDown => {
                unreachable!("handles are only shut down once in their lifetime");
            }
        }
    }
@@ -474,6 +621,8 @@ impl<T: Types> Drop for Cache<T> {

#[cfg(test)]
mod tests {
    use std::sync::Weak;

    use pageserver_api::{
        key::{rel_block_to_key, Key, DBDIR_KEY},
        models::ShardParameters,
@@ -583,39 +732,13 @@ mod tests {
        //
        // fill the cache
        //
        let handle: Handle<_> = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
        assert_eq!(cache.map.len(), 1);

        //
        // demonstrate that Handle holds up gate closure
@@ -640,21 +763,11 @@
        // SHUTDOWN
        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown

        assert_eq!(
            cache.map.len(),
            1,
            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
        );

        // this handle is perfectly usable
        handle.getpage();
@@ -678,16 +791,6 @@
        }

        drop(handle);

        // closing gate succeeds after dropping handle
        tokio::select! {
@@ -706,10 +809,8 @@
        assert_eq!(cache.map.len(), 0);

        // ensure all refs to shard0 are gone and we're not leaking anything
        drop(shard0);
        drop(mgr);
    }

    #[tokio::test]
@@ -948,15 +1049,11 @@
            handle
        };
        handle.getpage();
        used_handles.push(Arc::downgrade(&handle.timeline));
    }

    // No handles exist, thus gates are closed and don't require shutdown.
    assert!(used_handles
        .iter()
        .all(|weak| Weak::strong_count(weak) == 0));

    // Thus the gate should close immediately, even without shutdown.
    tokio::select! {
        _ = shard0.gate.close() => { }
        _ = tokio::time::sleep(FOREVER) => {
@@ -964,4 +1061,172 @@
            }
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_weak_handles() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };

        let refcount_start = Arc::strong_count(&shard0);

        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));

        let weak_handle = handle.downgrade();

        drop(handle);

        let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it");

        // Start shutdown.
        shard0.per_timeline_state.shutdown();

        // Upgrades during shutdown don't work, even if upgraded_handle exists.
        weak_handle
            .upgrade()
            .err()
            .expect("can't upgrade weak handle as soon as shutdown started");

        // But upgraded_handle is still alive, so the gate won't close.
        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }

        // Drop the last handle.
        drop(upgraded_handle);

        // The gate should close now, despite there still being a weak_handle.
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("only strong handle is dropped and we shut down per-timeline-state")
            }
        }

        // The weak handle still can't be upgraded.
        weak_handle
            .upgrade()
            .err()
            .expect("still shouldn't be able to upgrade the weak handle");

        // There should be no strong references to the timeline object except the one on the stack.
        assert_eq!(Arc::strong_count(&shard0), refcount_start);
    }

    #[tokio::test(start_paused = true)]
    async fn test_reference_cycle_broken_when_cache_is_dropped() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        // Helper to check if a handle is referenced by per_timeline_state.
        let per_timeline_state_refs_handle = |handle_weak: &Weak<Mutex<HandleInner<_>>>| {
            let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap();
            let per_timeline_state = per_timeline_state.as_ref().unwrap();
            per_timeline_state
                .values()
                .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak))
        };

        // Fill the cache.
        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
        let handle_inner_weak = Arc::downgrade(&handle.inner);
        assert!(
            per_timeline_state_refs_handle(&handle_inner_weak),
            "we still hold `handle` _and_ haven't dropped `cache` yet"
        );

        // Drop the cache.
        drop(cache);

        assert!(
            !(per_timeline_state_refs_handle(&handle_inner_weak)),
            "nothing should reference the handle allocation anymore"
        );
        assert!(
            Weak::upgrade(&handle_inner_weak).is_some(),
            "the local `handle` still keeps the allocation alive"
        );
        // But obviously the cache is gone, so no new handles can be handed out.

        // Drop handle.
        drop(handle);
        assert!(
            Weak::upgrade(&handle_inner_weak).is_none(),
            "the local `handle` is dropped, so the allocation should be dropped by now"
        );
    }

    #[tokio::test(start_paused = true)]
    async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();
        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        // Grab a weak reference to the inner so we can later try to Weak::upgrade it and assert that fails.
        let handle_inner_weak = Arc::downgrade(&handle.inner);

        // Drop the handle; the lifetime of `inner` is at least as long as each strong reference to it.
        drop(handle);
        assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still upgrade");

        // Shutdown the per_timeline_state.
        shard0.per_timeline_state.shutdown();
        assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer upgrade");

        // The cache only contains Weak refs, so it can outlive the per_timeline_state;
        // drop it explicitly solely to make this point.
        drop(cache);
    }
}
@@ -2,10 +2,11 @@ use std::sync::Arc;

use pageserver_api::models::{TenantState, TimelineState};

use super::delete::{delete_local_timeline_directory, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};

#[derive(thiserror::Error, Debug)]
@@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline(
    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");

    let allow_offloaded_children = true;
    let (timeline, guard) = make_timeline_delete_guard(
        tenant,
        timeline.timeline_id,
        allow_offloaded_children,
        TimelineDeleteGuardKind::Offload,
    )
    .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;

@@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline(
}

/// It is important that this gets called while the DeletionGuard is held.
/// For more context see comments in [`make_timeline_delete_guard`].
///
/// Returns the strong count of the timeline `Arc`.
fn remove_timeline_from_tenant(

@@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection(

    let (replication_client, connection) = {
        let mut config = wal_source_connconf.to_tokio_postgres_config();
        config.application_name(format!("pageserver-{}", node.0).as_str());
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
            Ok(client_and_conn) => client_and_conn?,

@@ -913,27 +913,25 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
    }
    else
    {
        if (lfc_ctl->used >= lfc_ctl->limit)
        {
            /* Cache overflow: evict least recently used chunk */
            FileCacheEntry *victim;

            if (dlist_is_empty(&lfc_ctl->lru))
            {
                /*
                 * No unpinned entries are available. This should happen very
                 * rarely in real life, because the maximal number of backends
                 * should be smaller than the LFC size. So just refuse to
                 * update the LFC in this case.
                 */
                neon_log(LOG, "LFC overflow: no unpinned entries available");
                hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_REMOVE, NULL);
                LWLockRelease(lfc_lock);
                return;
            }
            victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));

            for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
            {
                lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;

@@ -911,7 +911,74 @@ pageserver_receive(shardno_t shard_no)
        }
        PG_CATCH();
        {
            neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
            pageserver_disconnect(shard_no);
            PG_RE_THROW();
        }
        PG_END_TRY();

        if (message_level_is_interesting(PageStoreTrace))
        {
            char *msg = nm_to_string((NeonMessage *) resp);

            neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
            pfree(msg);
        }
    }
    else if (rc == -1)
    {
        neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
        pageserver_disconnect(shard_no);
        resp = NULL;
    }
    else if (rc == -2)
    {
        char *msg = pchomp(PQerrorMessage(pageserver_conn));

        pageserver_disconnect(shard_no);
        neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
    }
    else
    {
        pageserver_disconnect(shard_no);
        neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
    }

    shard->nresponses_received++;
    return (NeonResponse *) resp;
}

static NeonResponse *
pageserver_try_receive(shardno_t shard_no)
{
    StringInfoData resp_buff;
    NeonResponse *resp;
    PageServer *shard = &page_servers[shard_no];
    PGconn     *pageserver_conn = shard->conn;
    /* read response */
    int         rc;

    if (shard->state != PS_Connected)
        return NULL;

    Assert(pageserver_conn);

    rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */);

    if (rc == 0)
        return NULL;
    else if (rc > 0)
    {
        PG_TRY();
        {
            resp_buff.len = rc;
            resp_buff.cursor = 0;
            resp = nm_unpack_response(&resp_buff);
            PQfreemem(resp_buff.data);
        }
        PG_CATCH();
        {
            neon_shard_log(shard_no, LOG, "pageserver_try_receive: disconnect due to failure while parsing response");
            pageserver_disconnect(shard_no);
            PG_RE_THROW();
        }
@@ -980,6 +1047,7 @@ page_server_api api =
    .send = pageserver_send,
    .flush = pageserver_flush,
    .receive = pageserver_receive,
    .try_receive = pageserver_try_receive,
    .disconnect = pageserver_disconnect_shard
};

@@ -34,6 +34,8 @@ typedef enum
    T_NeonGetPageRequest,
    T_NeonDbSizeRequest,
    T_NeonGetSlruSegmentRequest,
    /* future tags above this line */
    T_NeonTestRequest = 99,     /* only in cfg(feature = "testing") */

    /* pagestore -> pagestore_client */
    T_NeonExistsResponse = 100,
@@ -42,6 +44,8 @@ typedef enum
    T_NeonErrorResponse,
    T_NeonDbSizeResponse,
    T_NeonGetSlruSegmentResponse,
    /* future tags above this line */
    T_NeonTestResponse = 199,   /* only in cfg(feature = "testing") */
} NeonMessageTag;

typedef uint64 NeonRequestId;
@@ -192,9 +196,29 @@ typedef uint16 shardno_t;

typedef struct
{
    /*
     * Send this request to the PageServer associated with this shard.
     */
    bool        (*send) (shardno_t shard_no, NeonRequest *request);
    /*
     * Blocking read for the next response of this shard.
     *
     * When a CANCEL signal is handled, the connection state will be
     * unmodified.
     */
    NeonResponse *(*receive) (shardno_t shard_no);
    /*
     * Try to get the next response from the TCP buffers, if any.
     * Returns NULL when the data is not yet available.
     */
    NeonResponse *(*try_receive) (shardno_t shard_no);
    /*
     * Make sure all requests are sent to the PageServer.
     */
    bool        (*flush) (shardno_t shard_no);
    /*
     * Disconnect from this pageserver shard.
     */
    void        (*disconnect) (shardno_t shard_no);
} page_server_api;

@@ -405,6 +405,56 @@ compact_prefetch_buffers(void)
    return false;
}

/*
 * If there might be responses still in the TCP buffer, then
 * we should try to use those, so as to reduce any TCP backpressure
 * on the OS/PS side.
 *
 * This procedure handles that.
 *
 * Note that this is only valid as long as the only pipelined
 * operations in the TCP buffer are getPage@Lsn requests.
 */
static void
prefetch_pump_state(void)
{
    while (MyPState->ring_receive != MyPState->ring_flush)
    {
        NeonResponse *response;
        PrefetchRequest *slot;
        MemoryContext old;

        slot = GetPrfSlot(MyPState->ring_receive);

        old = MemoryContextSwitchTo(MyPState->errctx);
        response = page_server->try_receive(slot->shard_no);
        MemoryContextSwitchTo(old);

        if (response == NULL)
            break;

        /* The slot should still be valid */
        if (slot->status != PRFS_REQUESTED ||
            slot->response != NULL ||
            slot->my_ring_index != MyPState->ring_receive)
            neon_shard_log(slot->shard_no, ERROR,
                           "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
                           slot->status, slot->response,
                           (long) slot->my_ring_index, (long) MyPState->ring_receive);

        /* update prefetch state */
        MyPState->n_responses_buffered += 1;
        MyPState->n_requests_inflight -= 1;
        MyPState->ring_receive += 1;
        MyNeonCounters->getpage_prefetches_buffered =
            MyPState->n_responses_buffered;

        /* update slot state */
        slot->status = PRFS_RECEIVED;
        slot->response = response;
    }
}

void
readahead_buffer_resize(int newsize, void *extra)
{
@@ -2808,6 +2858,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
               MyPState->ring_last <= ring_index);
    }

    prefetch_pump_state();

    return false;
}

@@ -2849,6 +2901,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
    Assert(ring_index < MyPState->ring_unused &&
           MyPState->ring_last <= ring_index);

    prefetch_pump_state();

    return false;
}
#endif                          /* PG_MAJORVERSION_NUM < 17 */
@@ -2891,6 +2945,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
     */
    neon_log(SmgrTrace, "writeback noop");

    prefetch_pump_state();

#ifdef DEBUG_COMPARE_LOCAL
    if (IS_LOCAL_REL(reln))
        mdwriteback(reln, forknum, blocknum, nblocks);
@@ -3145,6 +3201,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
    neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
    neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);

    prefetch_pump_state();

#ifdef DEBUG_COMPARE_LOCAL
    if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
    {
@@ -3282,6 +3340,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
    neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
                      buffers, nblocks, read);

    prefetch_pump_state();

#ifdef DEBUG_COMPARE_LOCAL
    if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
    {
@@ -3450,6 +3510,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo

    lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

    prefetch_pump_state();

#ifdef DEBUG_COMPARE_LOCAL
    if (IS_LOCAL_REL(reln))
#if PG_MAJORVERSION_NUM >= 17
@@ -3503,6 +3565,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,

    lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);

    prefetch_pump_state();

#ifdef DEBUG_COMPARE_LOCAL
    if (IS_LOCAL_REL(reln))
        mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
@@ -3792,6 +3856,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)

    neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");

    prefetch_pump_state();

#ifdef DEBUG_COMPARE_LOCAL
    if (IS_LOCAL_REL(reln))
        mdimmedsync(reln, forknum);

@@ -102,6 +102,11 @@ impl Client {
        self.get(&uri).await
    }

    pub async fn utilization(&self) -> Result<reqwest::Response> {
        let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
        self.get(&uri).await
    }

    async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
        self.request(Method::GET, uri, ()).await
    }

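As a hedged usage sketch of the new client method (the `client` binding and the JSON handling are illustrative; the response body shape follows the safekeeper changes further below):

```rust,ignore
// Assumes an existing `client: Client` pointing at a safekeeper's mgmt API.
let resp = client.utilization().await?;
let body: serde_json::Value = resp.json().await?;
// Expected shape per SafekeeperUtilization: {"timeline_count": <u64>}
println!("{body}");
```
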
@@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler {

/// Parsed Postgres command.
enum SafekeeperPostgresCommand {
    StartWalPush {
        proto_version: u32,
        // Eventually timelines will always be created explicitly by storcon.
        // This option allows legacy behaviour for compute to do that until we
        // fully migrate.
        allow_timeline_creation: bool,
    },
    StartReplication {
        start_lsn: Lsn,
        term: Option<Term>,
    },
    IdentifySystem,
    TimelineStatus,
    JSONCtrl {
        cmd: AppendLogicalMessage,
    },
}

fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
    if cmd.starts_with("START_WAL_PUSH") {
        // Allow additional options in postgres START_REPLICATION style like
        // START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false').
        // Parsing here is very naive and breaks in case of commas or
        // whitespace in values, but it is enough for our purposes.
        let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap();
        let caps = re
            .captures(cmd)
            .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?;
        // capture () content
        let options = caps.get(2).map(|m| m.as_str()).unwrap_or("");
        // default values
        let mut proto_version = 2;
        let mut allow_timeline_creation = true;
        for kvstr in options.split(",") {
            if kvstr.is_empty() {
                continue;
            }
            let mut kvit = kvstr.split_whitespace();
            let key = kvit.next().context(format!(
                "failed to parse key in kv {} in command {}",
                kvstr, cmd
            ))?;
            let value = kvit.next().context(format!(
                "failed to parse value in kv {} in command {}",
                kvstr, cmd
            ))?;
            let value_trimmed = value.trim_matches('\'');
            if key == "proto_version" {
                proto_version = value_trimmed.parse::<u32>().context(format!(
                    "failed to parse proto_version value {} in command {}",
                    value, cmd
                ))?;
            }
            if key == "allow_timeline_creation" {
                allow_timeline_creation = value_trimmed.parse::<bool>().context(format!(
                    "failed to parse allow_timeline_creation value {} in command {}",
                    value, cmd
                ))?;
            }
        }
        Ok(SafekeeperPostgresCommand::StartWalPush {
            proto_version,
            allow_timeline_creation,
        })
    } else if cmd.starts_with("START_REPLICATION") {
        let re = Regex::new(
            // We follow postgres START_REPLICATION LOGICAL options to pass term.
@@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {

fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
    match cmd {
        SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH",
        SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
        SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
        SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
@@ -293,8 +347,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
        self.ttid = TenantTimelineId::new(tenant_id, timeline_id);

        match cmd {
            SafekeeperPostgresCommand::StartWalPush {
                proto_version,
                allow_timeline_creation,
            } => {
                self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation)
                    .instrument(info_span!("WAL receiver"))
                    .await
            }
@@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler {
        }
    }
}

#[cfg(test)]
mod tests {
    use super::SafekeeperPostgresCommand;

    /// Test parsing of the START_WAL_PUSH command.
    #[test]
    fn test_start_wal_push_parse() {
        let cmd = "START_WAL_PUSH";
        let parsed = super::parse_cmd(cmd).expect("failed to parse");
        match parsed {
            SafekeeperPostgresCommand::StartWalPush {
                proto_version,
                allow_timeline_creation,
            } => {
                assert_eq!(proto_version, 2);
                assert!(allow_timeline_creation);
            }
            _ => panic!("unexpected command"),
        }

        let cmd =
            "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')";
        let parsed = super::parse_cmd(cmd).expect("failed to parse");
        match parsed {
            SafekeeperPostgresCommand::StartWalPush {
                proto_version,
                allow_timeline_creation,
            } => {
                assert_eq!(proto_version, 3);
                assert!(!allow_timeline_creation);
            }
            _ => panic!("unexpected command"),
        }
    }
}

@@ -127,6 +127,13 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
    json_response(StatusCode::OK, ())
}

async fn utilization_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
    let global_timelines = get_global_timelines(&request);
    let utilization = global_timelines.get_timeline_counts();
    json_response(StatusCode::OK, utilization)
}

/// List all (not deleted) timelines.
/// Note: it is possible to do the same with debug_dump.
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -620,6 +627,7 @@ pub fn make_router(
            failpoints_handler(r, cancel).await
        })
    })
    .get("/v1/utilization", |r| request_span(r, utilization_handler))
    .delete("/v1/tenant/:tenant_id", |r| {
        request_span(r, tenant_delete_handler)
    })

@@ -200,9 +200,14 @@ impl SafekeeperPostgresHandler {
    pub async fn handle_start_wal_push<IO: AsyncRead + AsyncWrite + Unpin>(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        proto_version: u32,
        allow_timeline_creation: bool,
    ) -> Result<(), QueryError> {
        let mut tli: Option<WalResidentTimeline> = None;
        if let Err(end) = self
            .handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation)
            .await
        {
            // Log the result and probably send it to the client, closing the stream.
            let handle_end_fut = pgb.handle_copy_stream_end(end);
            // If we managed to create the timeline, augment logging with current LSNs etc.
@@ -222,6 +227,8 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        tli: &mut Option<WalResidentTimeline>,
        proto_version: u32,
        allow_timeline_creation: bool,
    ) -> Result<(), CopyStreamHandlerEnd> {
        // The `tli` parameter is only used for passing _out_ a timeline, one should
        // not have been passed in.
@@ -250,12 +257,17 @@ impl SafekeeperPostgresHandler {
            conn_id: self.conn_id,
            pgb_reader: &mut pgb_reader,
            peer_addr,
            proto_version,
            acceptor_handle: &mut acceptor_handle,
            global_timelines: self.global_timelines.clone(),
        };

        // Read the first message and create the timeline if needed and allowed. This
        // won't be necessary once timelines are always created by storcon and
        // allow_timeline_creation becomes false.
        let res = network_reader
            .read_first_message(allow_timeline_creation)
            .await;

        let network_res = if let Ok((timeline, next_msg)) = res {
            let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
@@ -313,6 +325,7 @@ struct NetworkReader<'a, IO> {
    conn_id: ConnectionId,
    pgb_reader: &'a mut PostgresBackendReader<IO>,
    peer_addr: SocketAddr,
    proto_version: u32,
    // WalAcceptor is spawned when we learn server info from walproposer and
    // create timeline; handle is put here.
    acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
@@ -322,9 +335,10 @@ struct NetworkReader<'a, IO> {
impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
    async fn read_first_message(
        &mut self,
        allow_timeline_creation: bool,
    ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
        // Receive information about server to create timeline, if not yet.
        let next_msg = read_message(self.pgb_reader, self.proto_version).await?;
        let tli = match next_msg {
            ProposerAcceptorMessage::Greeting(ref greeting) => {
                info!(
@@ -336,17 +350,22 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
                    system_id: greeting.system_id,
                    wal_seg_size: greeting.wal_seg_size,
                };
                let tli = if allow_timeline_creation {
                    self.global_timelines
                        .create(
                            self.ttid,
                            Configuration::empty(),
                            server_info,
                            Lsn::INVALID,
                            Lsn::INVALID,
                        )
                        .await
                        .context("create timeline")?
                } else {
                    self.global_timelines
                        .get(self.ttid)
                        .context("get timeline")?
                };
                tli.wal_residence_guard().await?
            }
            _ => {
@@ -375,7 +394,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
        ));

        // Forward all messages to WalAcceptor
        read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await
    }
}

@@ -383,9 +402,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
/// TODO: Return Ok(None) on graceful termination.
async fn read_message<IO: AsyncRead + AsyncWrite + Unpin>(
    pgb_reader: &mut PostgresBackendReader<IO>,
    proto_version: u32,
) -> Result<ProposerAcceptorMessage, CopyStreamHandlerEnd> {
    let copy_data = pgb_reader.read_copy_message().await?;
    let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?;
    Ok(msg)
}

@@ -393,6 +413,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
    pgb_reader: &mut PostgresBackendReader<IO>,
    msg_tx: Sender<ProposerAcceptorMessage>,
    mut next_msg: ProposerAcceptorMessage,
    proto_version: u32,
) -> Result<(), CopyStreamHandlerEnd> {
    /// Threshold for logging slow WalAcceptor sends.
    const SLOW_THRESHOLD: Duration = Duration::from_secs(5);
@@ -425,7 +446,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
        WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc();
        WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64);

        next_msg = read_message(pgb_reader, proto_version).await?;
    }
}

@@ -29,7 +29,7 @@ use utils::{
    lsn::Lsn,
};

pub const SK_PROTOCOL_VERSION: u32 = 2;
pub const UNKNOWN_SERVER_VERSION: u32 = 0;

#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@@ -317,7 +317,14 @@ pub enum ProposerAcceptorMessage {

impl ProposerAcceptorMessage {
    /// Parse proposer message.
    pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
        if proto_version != SK_PROTOCOL_VERSION {
            bail!(
                "incompatible protocol version {}, expected {}",
                proto_version,
                SK_PROTOCOL_VERSION
            );
        }
        // xxx using Reader is inefficient but easy to work with bincode
        let mut stream = msg_bytes.reader();
        // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is

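A minimal sketch of the version gate added above (the message bytes are illustrative; any non-matching version is rejected before the bytes are interpreted):

```rust,ignore
use bytes::Bytes;

// A proposer speaking an unsupported protocol version is rejected up front.
let err = ProposerAcceptorMessage::parse(Bytes::from_static(b"\0"), 3).unwrap_err();
assert!(err.to_string().contains("incompatible protocol version"));
```
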
@@ -13,6 +13,7 @@ use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use safekeeper_api::membership::Configuration;
use safekeeper_api::models::SafekeeperUtilization;
use safekeeper_api::ServerInfo;
use serde::Serialize;
use std::collections::HashMap;
@@ -416,6 +417,20 @@ impl GlobalTimelines {
            .collect()
    }

    /// Returns statistics about timeline counts
    pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
        let global_lock = self.state.lock().unwrap();
        let timeline_count = global_lock
            .timelines
            .values()
            .filter(|t| match t {
                GlobalMapTimeline::CreationInProgress => false,
                GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
            })
            .count() as u64;
        SafekeeperUtilization { timeline_count }
    }

    /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
    /// and that's why it can return cancelled timelines, to retry deleting them.
    fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
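
The new counter skips slots that are still mid-creation and timelines that are already cancelled. The same filter in a self-contained model, with a toy enum standing in for GlobalMapTimeline:

use std::collections::HashMap;

enum Slot {
    CreationInProgress,
    Timeline { cancelled: bool },
}

fn timeline_count(timelines: &HashMap<u64, Slot>) -> u64 {
    timelines
        .values()
        .filter(|t| match t {
            Slot::CreationInProgress => false,
            Slot::Timeline { cancelled } => !cancelled,
        })
        .count() as u64
}

fn main() {
    let mut map = HashMap::new();
    map.insert(1, Slot::Timeline { cancelled: false });
    map.insert(2, Slot::Timeline { cancelled: true });
    map.insert(3, Slot::CreationInProgress);
    assert_eq!(timeline_count(&map), 1); // only the live timeline is counted
}
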
@@ -15,7 +15,9 @@ use desim::{
};
use http::Uri;
use safekeeper::{
    safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION},
    safekeeper::{
        ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION,
    },
    state::{TimelinePersistentState, TimelineState},
    timeline::TimelineError,
    wal_storage::Storage,
@@ -285,7 +287,7 @@ impl ConnState {
            bail!("finished processing START_REPLICATION")
        }

        let msg = ProposerAcceptorMessage::parse(copy_data)?;
        let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?;
        debug!("got msg: {:?}", msg);
        self.process(msg, global)
    } else {
@@ -0,0 +1,2 @@
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled';
UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause';
@@ -0,0 +1,2 @@
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause';
UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled';
@@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::{
    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
    ShardsPreferredAzsRequest, TenantCreateRequest,
    SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest,
};
use pageserver_api::models::{
    TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
@@ -1305,6 +1305,35 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
        .unwrap())
}

/// Sets the scheduling policy of the specified safekeeper
async fn handle_safekeeper_scheduling_policy(
    mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

    let body = json_request::<SafekeeperSchedulingPolicyRequest>(&mut req).await?;
    let id = parse_request_param::<i64>(&req, "id")?;

    let req = match maybe_forward(req).await {
        ForwardOutcome::Forwarded(res) => {
            return res;
        }
        ForwardOutcome::NotForwarded(req) => req,
    };

    let state = get_state(&req);

    state
        .service
        .set_safekeeper_scheduling_policy(id, body.scheduling_policy)
        .await?;

    Ok(Response::builder()
        .status(StatusCode::NO_CONTENT)
        .body(Body::empty())
        .unwrap())
}

/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(
@@ -1873,7 +1902,18 @@ pub fn make_router(
        })
        .post("/control/v1/safekeeper/:id", |r| {
            // id is in the body
            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
            named_request_span(
                r,
                handle_upsert_safekeeper,
                RequestName("v1_safekeeper_post"),
            )
        })
        .post("/control/v1/safekeeper/:id/scheduling_policy", |r| {
            named_request_span(
                r,
                handle_safekeeper_scheduling_policy,
                RequestName("v1_safekeeper_status"),
            )
        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
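
A hypothetical client call against the new route, sketched with reqwest. The address, safekeeper id, and token are invented; the path, the admin scope, the body shape, and the 204 reply follow the handler above, and the policy value "Pause" matches what the test fixture later in this diff sends:

use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    let resp = client
        // Illustrative controller address and safekeeper id.
        .post("http://127.0.0.1:1234/control/v1/safekeeper/42/scheduling_policy")
        .bearer_auth("admin-token") // the handler checks Scope::Admin
        .json(&json!({ "id": 42, "scheduling_policy": "Pause" }))
        .send()
        .await?;
    // On success the handler responds 204 No Content.
    assert_eq!(resp.status(), reqwest::StatusCode::NO_CONTENT);
    Ok(())
}
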
@@ -1104,6 +1104,37 @@ impl Persistence {
        })
        .await
    }

    pub(crate) async fn set_safekeeper_scheduling_policy(
        &self,
        id_: i64,
        scheduling_policy_: SkSchedulingPolicy,
    ) -> Result<(), DatabaseError> {
        use crate::schema::safekeepers::dsl::*;

        self.with_conn(move |conn| -> DatabaseResult<()> {
            #[derive(Insertable, AsChangeset)]
            #[diesel(table_name = crate::schema::safekeepers)]
            struct UpdateSkSchedulingPolicy<'a> {
                id: i64,
                scheduling_policy: &'a str,
            }
            let scheduling_policy_ = String::from(scheduling_policy_);

            let rows_affected = diesel::update(safekeepers.filter(id.eq(id_)))
                .set(scheduling_policy.eq(scheduling_policy_))
                .execute(conn)?;

            if rows_affected != 1 {
                return Err(DatabaseError::Logical(format!(
                    "unexpected number of rows ({rows_affected})",
                )));
            }

            Ok(())
        })
        .await
    }
}

/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
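
The update refuses to succeed silently: anything other than exactly one affected row (say, an unknown safekeeper id) becomes a logical database error. The guard in isolation, with a local error type standing in for the controller's DatabaseError:

#[derive(Debug)]
enum DatabaseError {
    Logical(String),
}

fn expect_exactly_one(rows_affected: usize) -> Result<(), DatabaseError> {
    if rows_affected != 1 {
        return Err(DatabaseError::Logical(format!(
            "unexpected number of rows ({rows_affected})"
        )));
    }
    Ok(())
}

fn main() {
    assert!(expect_exactly_one(1).is_ok());
    assert!(expect_exactly_one(0).is_err()); // id matched no row
}
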
@@ -47,7 +47,7 @@ use pageserver_api::{
    AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
    NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
    SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
    ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
    ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
    TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
    TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
    TenantShardMigrateResponse,
@@ -5411,6 +5411,15 @@ impl Service {

        expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));

        // Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig`
        // definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new
        // fields are added, before doing a comparison.
        for tsp in &mut persistent_shards {
            let config: TenantConfig = serde_json::from_str(&tsp.config)
                .map_err(|e| ApiError::InternalServerError(e.into()))?;
            tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible");
        }

        if persistent_shards != expect_shards {
            tracing::error!("Consistency check failed on shards.");
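
The encode/decode cycle leans on serde defaults: unknown (legacy) fields are dropped on deserialize and missing new fields are filled in, so both sides of the comparison end up in the current schema. A small sketch with an invented single-field config:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct TenantConfig {
    #[serde(default)]
    gc_horizon: Option<u64>, // illustrative field; the real struct has many
}

fn normalize(stored_json: &str) -> serde_json::Result<String> {
    let config: TenantConfig = serde_json::from_str(stored_json)?;
    serde_json::to_string(&config)
}

fn main() {
    // A legacy field disappears after the round-trip; a missing new field appears.
    let normalized = normalize(r#"{"legacy_field": true}"#).unwrap();
    assert_eq!(normalized, r#"{"gc_horizon":null}"#);
}
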
@@ -7270,19 +7279,14 @@ impl Service {
        Ok(())
    }

    /// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
    /// 1. The node should be filled until it reaches the expected cluster average of
    ///    attached shards. If there are not enough secondaries on the node, the plan stops early.
    /// 2. Select tenant shards to promote such that the number of attached shards is balanced
    ///    throughout the cluster. We achieve this by picking tenant shards from each node,
    ///    starting from the ones with the largest number of attached shards, until the node
    ///    reaches the expected cluster average.
    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
    ///    for the number of tenants from the same shard promoted to the node being filled is:
    ///    shard count for the tenant divided by the number of nodes in the cluster.
    /// Create a node fill plan (pick secondaries to promote), based on:
    /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
    ///    outside their home AZ, should be migrated back here.
    /// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of
    ///    attached shards, we will promote more shards from the nodes with the most attached shards, unless
    ///    those shards have a home AZ that doesn't match the node we're filling.
    fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
        let mut locked = self.inner.write().unwrap();
        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
        let (nodes, tenants, _scheduler) = locked.parts_mut();

        let node_az = nodes
@@ -7291,53 +7295,79 @@ impl Service {
            .get_availability_zone_id()
            .clone();

        let mut tids_by_node = tenants
            .iter_mut()
            .filter_map(|(tid, tenant_shard)| {
                if !matches!(
                    tenant_shard.get_scheduling_policy(),
                    ShardSchedulingPolicy::Active
                ) {
                    // Only include tenants in fills if they have a normal (Active) scheduling policy. We
                    // even exclude Essential, because moving to fill a node is not essential to keeping this
                    // tenant available.
                    return None;
                }
        // The tenant shard IDs that we plan to promote from secondary to attached on this node
        let mut plan = Vec::new();

                // AZ check: when filling nodes after a restart, our intent is to move _back_ the
                // shards which belong on this node, not to promote shards whose scheduling preference
                // would be on their currently attached node. So we will avoid promoting shards whose
                // home AZ doesn't match the AZ of the node we're filling.
                match tenant_shard.preferred_az() {
                    None => {
                        // Shard doesn't have an AZ preference: it is eligible to be moved.
                    }
                    Some(az) if az == &node_az => {
                        // This shard's home AZ is equal to the node we're filling: it is
                        // eligible to be moved: fall through;
                    }
                    Some(_) => {
                        // This shard's home AZ is somewhere other than the node we're filling:
                        // do not include it in the fill plan.
                        return None;
                    }
                }
        // Collect shards which do not have a preferred AZ & are eligible for moving in stage 2
        let mut free_tids_by_node: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();

                if tenant_shard.intent.get_secondary().contains(&node_id) {
        // Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could
        // conceivably come up in real life if deploying a single-AZ region intentionally.
        let respect_azs = nodes
            .values()
            .map(|n| n.get_availability_zone_id())
            .unique()
            .count()
            > 1;

        // Step 1: collect all shards that we are required to migrate back to this node because their AZ preference
        // requires it.
        for (tsid, tenant_shard) in tenants {
            if !tenant_shard.intent.get_secondary().contains(&node_id) {
                // Shard doesn't have a secondary on this node, ignore it.
                continue;
            }

            // AZ check: when filling nodes after a restart, our intent is to move _back_ the
            // shards which belong on this node, not to promote shards whose scheduling preference
            // would be on their currently attached node. So we will avoid promoting shards whose
            // home AZ doesn't match the AZ of the node we're filling.
            match tenant_shard.preferred_az() {
                _ if !respect_azs => {
                    if let Some(primary) = tenant_shard.intent.get_attached() {
                        return Some((*primary, *tid));
                        free_tids_by_node.entry(*primary).or_default().push(*tsid);
                    }
                }
                None => {
                    // Shard doesn't have an AZ preference: it is eligible to be moved, but we
                    // will only do so if our target shard count requires it.
                    if let Some(primary) = tenant_shard.intent.get_attached() {
                        free_tids_by_node.entry(*primary).or_default().push(*tsid);
                    }
                }
                Some(az) if az == &node_az => {
                    // This shard's home AZ is equal to the node we're filling: it should
                    // be moved back to this node as part of filling, unless its currently
                    // attached location is also in its home AZ.
                    if let Some(primary) = tenant_shard.intent.get_attached() {
                        if nodes
                            .get(primary)
                            .expect("referenced node must exist")
                            .get_availability_zone_id()
                            != tenant_shard
                                .preferred_az()
                                .expect("tenant must have an AZ preference")
                        {
                            plan.push(*tsid)
                        }
                    } else {
                        plan.push(*tsid)
                    }
                }
                Some(_) => {
                    // This shard's home AZ is somewhere other than the node we're filling,
                    // it may not be moved back to this node as part of filling. Ignore it
                }
            }
        }

                None
            })
            .into_group_map();
        // Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments
        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);

        let expected_attached = locked.scheduler.expected_attached_shard_count();
        let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();

        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
        let mut plan = Vec::new();

        for (node_id, attached) in nodes_by_load {
            let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available());
@@ -7346,7 +7376,7 @@ impl Service {
            }

            if plan.len() >= fill_requirement
                || tids_by_node.is_empty()
                || free_tids_by_node.is_empty()
                || attached <= expected_attached
            {
                break;
@@ -7358,7 +7388,7 @@ impl Service {

            let mut remove_node = false;
            while take > 0 {
                match tids_by_node.get_mut(&node_id) {
                match free_tids_by_node.get_mut(&node_id) {
                    Some(tids) => match tids.pop() {
                        Some(tid) => {
                            let max_promote_for_tenant = std::cmp::max(
@@ -7384,7 +7414,7 @@ impl Service {
            }

            if remove_node {
                tids_by_node.remove(&node_id);
                free_tids_by_node.remove(&node_id);
            }
        }
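
A condensed model of the reworked plan: stage 1 promotions are obligatory (home-AZ shards attached elsewhere come back), while stage 2 tops up with AZ-agnostic shards only until the fill requirement is met. All types are simplified stand-ins for the controller's:

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct ShardId(u32);

struct Shard {
    id: ShardId,
    home_az: Option<&'static str>,
    attached_az: &'static str,
    has_secondary_on_node: bool,
}

fn fill_node_plan(node_az: &str, fill_requirement: usize, shards: &[Shard]) -> Vec<ShardId> {
    let mut plan = Vec::new();
    let mut free = Vec::new();

    for s in shards.iter().filter(|s| s.has_secondary_on_node) {
        match s.home_az {
            // Stage 1: home-AZ shards attached elsewhere must come back.
            Some(az) if az == node_az => {
                if s.attached_az != az {
                    plan.push(s.id);
                }
            }
            // No preference: promoted only if stage 2 needs the capacity.
            None => free.push(s.id),
            // Belongs to a different AZ: never promoted onto this node.
            Some(_) => {}
        }
    }

    // Stage 2: top up with AZ-agnostic shards until the requirement is met.
    while plan.len() < fill_requirement {
        let Some(id) = free.pop() else { break };
        plan.push(id);
    }
    plan
}

fn main() {
    let shards = [
        Shard { id: ShardId(1), home_az: Some("az-a"), attached_az: "az-b", has_secondary_on_node: true },
        Shard { id: ShardId(2), home_az: None, attached_az: "az-b", has_secondary_on_node: true },
    ];
    // Shard 1 is a mandatory stage-1 move; shard 2 fills the remaining slot in stage 2.
    assert_eq!(fill_node_plan("az-a", 2, &shards), vec![ShardId(1), ShardId(2)]);
}
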
@@ -7651,6 +7681,16 @@ impl Service {
        self.persistence.safekeeper_upsert(record).await
    }

    pub(crate) async fn set_safekeeper_scheduling_policy(
        &self,
        id: i64,
        scheduling_policy: SkSchedulingPolicy,
    ) -> Result<(), DatabaseError> {
        self.persistence
            .set_safekeeper_scheduling_policy(id, scheduling_policy)
            .await
    }

    pub(crate) async fn update_shards_preferred_azs(
        &self,
        req: ShardsPreferredAzsRequest,
@@ -1,11 +1,17 @@
use std::{sync::Arc, time::Duration};
use std::{
    collections::{BTreeMap, HashMap},
    sync::Arc,
    time::Duration,
};

use pageserver_api::controller_api::ShardSchedulingPolicy;
use rand::seq::SliceRandom;
use rand::thread_rng;
use tokio_util::sync::CancellationToken;
use utils::id::NodeId;
use utils::shard::TenantShardId;

use super::Service;
use super::{Node, Scheduler, Service, TenantShard};

pub struct ChaosInjector {
    service: Arc<Service>,
@@ -35,50 +41,86 @@ impl ChaosInjector {
        }
    }

    /// If a shard has a secondary and attached location, then re-assign the secondary to be
    /// attached and the attached to be secondary.
    ///
    /// Only modifies tenants if they're in Active scheduling policy.
    fn maybe_migrate_to_secondary(
        &self,
        tenant_shard_id: TenantShardId,
        nodes: &Arc<HashMap<NodeId, Node>>,
        tenants: &mut BTreeMap<TenantShardId, TenantShard>,
        scheduler: &mut Scheduler,
    ) {
        let shard = tenants
            .get_mut(&tenant_shard_id)
            .expect("Held lock between choosing ID and this get");

        if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
            // Skip non-active scheduling policies, so that a shard with a policy like Pause can
            // be pinned without being disrupted by us.
            tracing::info!(
                "Skipping shard {tenant_shard_id}: scheduling policy is {:?}",
                shard.get_scheduling_policy()
            );
            return;
        }

        // Pick a secondary to promote
        let Some(new_location) = shard
            .intent
            .get_secondary()
            .choose(&mut thread_rng())
            .cloned()
        else {
            tracing::info!(
                "Skipping shard {tenant_shard_id}: no secondary location, can't migrate"
            );
            return;
        };

        let Some(old_location) = *shard.intent.get_attached() else {
            tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location");
            return;
        };

        tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}");

        shard.intent.demote_attached(scheduler, old_location);
        shard.intent.promote_attached(scheduler, new_location);
        self.service.maybe_reconcile_shard(shard, nodes);
    }

    async fn inject_chaos(&mut self) {
        // Pick some shards to interfere with
        let batch_size = 128;
        let mut inner = self.service.inner.write().unwrap();
        let (nodes, tenants, scheduler) = inner.parts_mut();
        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);

        for victim in victims {
            let shard = tenants
                .get_mut(victim)
                .expect("Held lock between choosing ID and this get");

            if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
                // Skip non-active scheduling policies, so that a shard with a policy like Pause can
                // be pinned without being disrupted by us.
                tracing::info!(
                    "Skipping shard {victim}: scheduling policy is {:?}",
                    shard.get_scheduling_policy()
                );
                continue;
        // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector
        // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some
        // random tenants to move, and then on next chaos iteration moving them back, then picking some new
        // random tenants on the next iteration.
        let mut victims = Vec::with_capacity(batch_size);
        for shard in tenants.values() {
            if shard.is_attached_outside_preferred_az(nodes) {
                victims.push(shard.tenant_shard_id);
            }

            // Pick a secondary to promote
            let Some(new_location) = shard
                .intent
                .get_secondary()
                .choose(&mut thread_rng())
                .cloned()
            else {
                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
                continue;
            };
            if victims.len() >= batch_size {
                break;
            }
        }

            let Some(old_location) = *shard.intent.get_attached() else {
                tracing::info!("Skipping shard {victim}: currently has no attached location");
                continue;
            };
        let choose_random = batch_size.saturating_sub(victims.len());
        tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());

            tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}");
        let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
        victims.extend(random_victims);

            shard.intent.demote_attached(scheduler, old_location);
            shard.intent.promote_attached(scheduler, new_location);
            self.service.maybe_reconcile_shard(shard, nodes);
        for victim in victims {
            self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
        }
    }
}
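
Victim selection in the new loop, reduced to its core: displaced shards first, then a random top-up to the batch size. IDs are plain integers here, and, as in the original, the random fill can occasionally re-pick a shard that is already in the batch:

use rand::seq::SliceRandom;
use rand::thread_rng;

fn pick_victims(outside_home_az: &[u32], all_tenants: &[u32], batch_size: usize) -> Vec<u32> {
    // Shards already displaced from their home AZ are migrated first ...
    let mut victims: Vec<u32> = outside_home_az.iter().take(batch_size).copied().collect();
    // ... and the rest of the batch is filled with random picks.
    let remaining = batch_size.saturating_sub(victims.len());
    victims.extend(all_tenants.choose_multiple(&mut thread_rng(), remaining).copied());
    victims
}

fn main() {
    let displaced = [7, 9];
    let all = [1, 2, 3, 4, 5, 6, 7, 8, 9];
    let victims = pick_victims(&displaced, &all, 4);
    assert_eq!(victims.len(), 4);
    assert!(victims.starts_with(&[7, 9])); // displaced shards go first
}
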
@@ -1793,6 +1793,23 @@ impl TenantShard {
            }
        }
    }

    /// Returns true if the tenant shard is attached to a node that is outside the preferred AZ.
    ///
    /// If the shard does not have a preferred AZ, returns false.
    pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap<NodeId, Node>) -> bool {
        let Some(preferred_az) = self.intent.preferred_az_id.as_ref() else {
            // No AZ preference: the shard cannot be attached "outside" it.
            return false;
        };
        self.intent
            .get_attached()
            .map(|node_id| {
                nodes
                    .get(&node_id)
                    .expect("referenced node exists")
                    .get_availability_zone_id()
                    != preferred_az
            })
            .unwrap_or(false)
    }
}

impl Drop for TenantShard {
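
The predicate reduced to a truth table: only a shard that both has a preferred AZ and is attached somewhere else counts as displaced. A toy version that skips the node-map lookup:

fn attached_outside_preferred_az(attached_az: Option<&str>, preferred_az: Option<&str>) -> bool {
    match (attached_az, preferred_az) {
        // Attached and a preference exists: displaced iff the AZs differ.
        (Some(attached), Some(preferred)) => attached != preferred,
        // Detached shards and shards without a preference are never "outside".
        _ => false,
    }
}

fn main() {
    assert!(attached_outside_preferred_az(Some("az-b"), Some("az-a")));
    assert!(!attached_outside_preferred_az(Some("az-a"), Some("az-a")));
    assert!(!attached_outside_preferred_az(Some("az-a"), None));
    assert!(!attached_outside_preferred_az(None, Some("az-a")));
}
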
@@ -370,6 +370,7 @@ class NeonEnvBuilder:
        pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None,
        num_safekeepers: int = 1,
        num_pageservers: int = 1,
        num_azs: int = 1,
        # Use non-standard SK ids to check for various parsing bugs
        safekeepers_id_start: int = 0,
        # fsync is disabled by default to make the tests go faster
@@ -401,6 +402,7 @@ class NeonEnvBuilder:
        self.pageserver_config_override = pageserver_config_override
        self.num_safekeepers = num_safekeepers
        self.num_pageservers = num_pageservers
        self.num_azs = num_azs
        self.safekeepers_id_start = safekeepers_id_start
        self.safekeepers_enable_fsync = safekeepers_enable_fsync
        self.auth_enabled = auth_enabled
@@ -990,6 +992,7 @@ class NeonEnv:
        self.endpoints = EndpointFactory(self)
        self.safekeepers: list[Safekeeper] = []
        self.pageservers: list[NeonPageserver] = []
        self.num_azs = config.num_azs
        self.broker = NeonBroker(self)
        self.pageserver_remote_storage = config.pageserver_remote_storage
        self.safekeepers_remote_storage = config.safekeepers_remote_storage
@@ -1090,14 +1093,21 @@ class NeonEnv:
            http=self.port_distributor.get_port(),
        )

        # Availability zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override`
        if self.num_azs > 1:
            # Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc.
            az_prefix = DEFAULT_AZ_ID[:-1]
            availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}"
        else:
            availability_zone = DEFAULT_AZ_ID

        ps_cfg: dict[str, Any] = {
            "id": ps_id,
            "listen_pg_addr": f"localhost:{pageserver_port.pg}",
            "listen_http_addr": f"localhost:{pageserver_port.http}",
            "pg_auth_type": pg_auth_type,
            "http_auth_type": http_auth_type,
            # Default which can be overridden with `NeonEnvBuilder.pageserver_config_override`
            "availability_zone": DEFAULT_AZ_ID,
            "availability_zone": availability_zone,
            # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
            # the pageserver taking a long time to start up due to syncfs flushing other tests' data
            "no_sync": True,
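
The fixture's round-robin naming strips the trailing zone letter from the default AZ id and cycles letters per pageserver. The same arithmetic transcribed to Rust for illustration (assumes an ASCII, letter-suffixed id like "us-east-2a"):

fn az_for_pageserver(default_az_id: &str, ps_id: usize, num_azs: usize) -> String {
    if num_azs <= 1 {
        return default_az_id.to_string();
    }
    // e.g. "us-east-2a" -> prefix "us-east-2", then letters 'a', 'b', ... per AZ
    let prefix = &default_az_id[..default_az_id.len() - 1];
    let letter = (b'a' + ((ps_id - 1) % num_azs) as u8) as char;
    format!("{prefix}{letter}")
}

fn main() {
    assert_eq!(az_for_pageserver("us-east-2a", 1, 3), "us-east-2a");
    assert_eq!(az_for_pageserver("us-east-2a", 2, 3), "us-east-2b");
    assert_eq!(az_for_pageserver("us-east-2a", 4, 3), "us-east-2a"); // wraps around
}
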
@@ -2336,6 +2346,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
            json=body,
        )

    def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str):
        self.request(
            "POST",
            f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy",
            headers=self.headers(TokenScope.ADMIN),
            json={"id": id, "scheduling_policy": scheduling_policy},
        )

    def get_safekeeper(self, id: int) -> dict[str, Any] | None:
        try:
            response = self.request(
@@ -4135,7 +4153,7 @@ class Endpoint(PgProtocol, LogUtils):

    # Checkpoints the running endpoint and returns pg_wal size in MB.
    def get_pg_wal_size(self):
        log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
        log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}")
        self.safe_psql("checkpoint")
        assert self.pgdata_dir is not None  # please mypy
        return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
@@ -4975,7 +4993,7 @@ def logical_replication_sync(
        if res:
            log.info(f"subscriber_lsn={res}")
            subscriber_lsn = Lsn(res)
            log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
            log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
            if subscriber_lsn >= publisher_lsn:
                return subscriber_lsn
        time.sleep(0.5)
@@ -53,6 +53,22 @@ class Workload:
        self._endpoint: Endpoint | None = None
        self._endpoint_opts = endpoint_opts or {}

    def branch(
        self,
        timeline_id: TimelineId,
        branch_name: str | None = None,
        endpoint_opts: dict[str, Any] | None = None,
    ) -> Workload:
        """
        Checkpoint the current status of the workload in case of branching
        """
        branch_workload = Workload(
            self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts
        )
        branch_workload.expect_rows = self.expect_rows
        branch_workload.churn_cursor = self.churn_cursor
        return branch_workload

    def reconfigure(self) -> None:
        """
        Request the endpoint to reconfigure based on location reported by storage controller
@@ -112,7 +112,11 @@ page_cache_size=10


@skip_in_debug_build("only run with release build")
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
    "with_branches",
    ["with_branches", "no_branches"],
)
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str):
    SMOKE_CONF = {
        # Run both gc and gc-compaction.
        "gc_period": "5s",
@@ -143,12 +147,17 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
    log.info("Writing initial data ...")
    workload.write_rows(row_count, env.pageserver.id)

    child_workloads: list[Workload] = []

    for i in range(1, churn_rounds + 1):
        if i % 10 == 0:
            log.info(f"Running churn round {i}/{churn_rounds} ...")

        if (i - 1) % 10 == 0:
            # Run gc-compaction every 10 rounds to ensure the test doesn't take too long.
        if i % 10 == 5 and with_branches == "with_branches":
            branch_name = f"child-{i}"
            branch_timeline_id = env.create_branch(branch_name)
            child_workloads.append(workload.branch(branch_timeline_id, branch_name))
        if (i - 1) % 10 == 0 or (i - 1) % 10 == 1:
            # Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long.
            ps_http.timeline_compact(
                tenant_id,
                timeline_id,
@@ -179,6 +188,9 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):

    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)
    for child_workload in child_workloads:
        log.info(f"Validating at branch {child_workload.branch_name}")
        child_workload.validate(env.pageserver.id)

    # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
    ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
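
The churn loop's cadence is all modular arithmetic on the 1-based round number; compressed into one helper for readability (a reading aid in Rust, not part of the test): branches are created on rounds 5, 15, 25, ... and gc-compaction runs on the first two rounds of each block of ten.

fn round_actions(i: u32, with_branches: bool) -> (bool, bool) {
    let create_branch = with_branches && i % 10 == 5;
    let run_gc_compaction = (i - 1) % 10 == 0 || (i - 1) % 10 == 1;
    (create_branch, run_gc_compaction)
}

fn main() {
    assert_eq!(round_actions(1, true), (false, true));
    assert_eq!(round_actions(5, true), (true, false));
    assert_eq!(round_actions(11, false), (false, true));
}
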
@@ -4,9 +4,19 @@ import time
from fixtures.neon_fixtures import NeonEnv

BTREE_NUM_CYCLEID_PAGES = """
WITH raw_pages AS (
    SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
    FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
WITH lsns AS (
    /*
     * pg_switch_wal() ensures we have an LSN that
     * 1. is after any previous modifications, but also,
     * 2. (critically) is flushed, preventing any issues with waiting for
     *    unflushed WAL in PageServer.
     */
    SELECT pg_switch_wal() as lsn
),
raw_pages AS (
    SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page
    FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno,
         lsns l(lsn)
),
parsed_pages AS (
    /* cycle ID is the last 2 bytes of the btree page */
@@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
    ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")

    ses1.execute("SELECT neon_xlogflush();")
    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
    pages = ses1.fetchall()
    assert (
@@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
    ses1.execute("DELETE FROM t WHERE id <= 610;")

    # Flush wal, for checking purposes
    ses1.execute("SELECT neon_xlogflush();")
    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
    pages = ses1.fetchall()
    assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
@@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
    # unpin the btree page, allowing s3's vacuum to complete
    ses2.execute("FETCH ALL FROM foo;")
    ses2.execute("ROLLBACK;")
    # flush WAL to make sure PS is up-to-date
    ses1.execute("SELECT neon_xlogflush();")
    # check that our expectations are correct
    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
    pages = ses1.fetchall()
@@ -0,0 +1,60 @@
# NB: there are benchmarks that double-serve as tests inside the `performance` directory.

import subprocess
from pathlib import Path

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder


@pytest.mark.timeout(30)  # test takes <20s if pageserver impl is correct
@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"])
def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str):
    def patch_pageserver_toml(config):
        config["page_service_pipelining"] = {
            "mode": "pipelined",
            "max_batch_size": 32,
            "execution": "concurrent-futures",
        }

    neon_env_builder.pageserver_config_override = patch_pageserver_toml
    env = neon_env_builder.init_start()

    log.info("make flush appear slow")

    log.info("sending requests until pageserver accepts no more")
    # TODO: extract this into a helper, like subprocess_capture,
    # so that we capture the stderr from the helper somewhere.
    child = subprocess.Popen(
        [
            neon_binpath / "test_helper_slow_client_reads",
            env.pageserver.connstr(),
            str(env.initial_tenant),
            str(env.initial_timeline),
        ],
        bufsize=0,  # unbuffered
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    assert child.stdout is not None
    buf = child.stdout.read(1)
    if len(buf) != 1:
        raise Exception("unexpected EOF")
    if buf != b"R":
        raise Exception(f"unexpected data: {buf!r}")
    log.info("helper reports pageserver accepts no more requests")
    log.info(
        "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace"
    )

    if kind == "pageserver-stop":
        log.info("try to shut down the pageserver cleanly")
        env.pageserver.stop()
    elif kind == "tenant-detach":
        log.info("try to shut down the tenant")
        env.pageserver.tenant_detach(env.initial_tenant)
    else:
        raise ValueError(f"unexpected kind: {kind}")

    log.info("shutdown did not time out, test passed")
@@ -2394,6 +2394,7 @@ def test_storage_controller_node_deletion(
    Test that deleting a node works & properly reschedules everything that was on the node.
    """
    neon_env_builder.num_pageservers = 3
    neon_env_builder.num_azs = 3
    env = neon_env_builder.init_configs()
    env.start()

@@ -2407,6 +2408,9 @@ def test_storage_controller_node_deletion(
        tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
    )

    # Sanity check: initial creations should not leave the system in an unstable scheduling state
    assert env.storage_controller.reconcile_all() == 0

    victim = env.pageservers[-1]

    # The procedure a human would follow is:
@@ -3208,6 +3212,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):

    assert eq_safekeeper_records(body, inserted_now)

    # some small tests for the scheduling policy querying and setting APIs
    newest_info = target.get_safekeeper(inserted["id"])
    assert newest_info
    assert newest_info["scheduling_policy"] == "Pause"
    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
    newest_info = target.get_safekeeper(inserted["id"])
    assert newest_info
    assert newest_info["scheduling_policy"] == "Decomissioned"
    # Ensure idempotency
    target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")


def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
    compared = [dict(a), dict(b)]
@@ -1,6 +1,7 @@
from __future__ import annotations

import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread

import pytest
@@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))

    def timeline_create():
        try:
            ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
            raise RuntimeError("creation succeeded even though it shouldn't")
        except ReadTimeout:
            pass

    Thread(target=timeline_create).start()

    def hit_initdb_upload_failpoint():
        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

    wait_until(hit_initdb_upload_failpoint)

    def creation_connection_timed_out():
        env.pageserver.assert_log_contains(
            "POST.*/timeline.* request was dropped before completing"
        )

    # Wait so that we hit the timeout and the connection is dropped
    # (But timeline creation still continues)
    wait_until(creation_connection_timed_out)

    ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
        ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
        raise RuntimeError("creation succeeded even though it shouldn't")

    def tenant_delete():
        def tenant_delete_inner():
@@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)

        wait_until(tenant_delete_inner)

    Thread(target=tenant_delete).start()
    # We will spawn background threads for timeline creation and tenant deletion. They will both
    # get blocked on our failpoint.
    with ThreadPoolExecutor(max_workers=1) as executor:
        create_fut = executor.submit(timeline_create)

        def deletion_arrived():
            env.pageserver.assert_log_contains(
                f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
            )
        def hit_initdb_upload_failpoint():
            env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

        wait_until(deletion_arrived)
        wait_until(hit_initdb_upload_failpoint)

        ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
        def creation_connection_timed_out():
            env.pageserver.assert_log_contains(
                "POST.*/timeline.* request was dropped before completing"
            )

        # Disable the failpoint and wait for deletion to finish
        ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
        # Wait so that we hit the timeout and the connection is dropped
        # (But timeline creation still continues)
        wait_until(creation_connection_timed_out)

        ps_http.tenant_delete(tenant_id)
        with pytest.raises(ReadTimeout):
            # Our creation failed from the client's point of view.
            create_fut.result()

        ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))

        delete_fut = executor.submit(tenant_delete)

        def deletion_arrived():
            env.pageserver.assert_log_contains(
                f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
            )

        wait_until(deletion_arrived)

        ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))

        # Disable the failpoint and wait for deletion to finish
        ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

        delete_fut.result()

    # Physical deletion should have happened
    assert_prefix_empty(
@@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
    io_metrics = query_all_safekeepers(
        "safekeeper_pg_io_bytes_total",
        {
            "app_name": "pageserver",
            "app_name": f"pageserver-{env.pageserver.id}",
            "client_az": "test_ps_az",
            "dir": io_direction,
            "same_az": "false",