mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-06 18:00:37 +00:00
Compare commits
20 Commits
release-pr
...
proxy_id
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0acc612e3e | ||
|
|
7d761a9d22 | ||
|
|
8bdaee35f3 | ||
|
|
b0f34099f9 | ||
|
|
6975228a76 | ||
|
|
053abff71f | ||
|
|
871e8b325f | ||
|
|
c47c5f4ace | ||
|
|
b0838a68e5 | ||
|
|
8f2ebc0684 | ||
|
|
3a285a046b | ||
|
|
da13154791 | ||
|
|
2e13a3aa7a | ||
|
|
cccc196848 | ||
|
|
e436dcad57 | ||
|
|
21d7b6a258 | ||
|
|
86dbc44db1 | ||
|
|
58f6af6c9a | ||
|
|
7be971081a | ||
|
|
6fe4c6798f |
91
.github/workflows/_check-codestyle-rust.yml
vendored
Normal file
91
.github/workflows/_check-codestyle-rust.yml
vendored
Normal file
@@ -0,0 +1,91 @@
|
||||
name: Check Codestyle Rust
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
build-tools-image:
|
||||
description: "build-tools image"
|
||||
required: true
|
||||
type: string
|
||||
archs:
|
||||
description: "Json array of architectures to run on"
|
||||
type: string
|
||||
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
strategy:
|
||||
matrix:
|
||||
arch: ${{ fromJson(inputs.archs) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
#
|
||||
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
|
||||
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
|
||||
# time just for that, so skip "clippy --release".
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
- name: Run cargo clippy (debug)
|
||||
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
78
.github/workflows/build_and_test.yml
vendored
78
.github/workflows/build_and_test.yml
vendored
@@ -164,77 +164,11 @@ jobs:
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
#
|
||||
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
|
||||
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
|
||||
# time just for that, so skip "clippy --release".
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
- name: Run cargo clippy (debug)
|
||||
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
uses: ./.github/workflows/_check-codestyle-rust.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
archs: '["x64", "arm64"]'
|
||||
secrets: inherit
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ tag, build-build-tools-image ]
|
||||
@@ -890,7 +824,7 @@ jobs:
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
promote-images-dev:
|
||||
needs: [ check-permissions, tag, vm-compute-node-image ]
|
||||
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
|
||||
33
.github/workflows/pre-merge-checks.yml
vendored
33
.github/workflows/pre-merge-checks.yml
vendored
@@ -1,6 +1,12 @@
|
||||
name: Pre-merge checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/_check-codestyle-python.yml
|
||||
- .github/workflows/_check-codestyle-rust.yml
|
||||
- .github/workflows/build-build-tools-image.yml
|
||||
- .github/workflows/pre-merge-checks.yml
|
||||
merge_group:
|
||||
branches:
|
||||
- main
|
||||
@@ -17,8 +23,10 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
python-changed: ${{ steps.python-src.outputs.any_changed }}
|
||||
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
|
||||
id: python-src
|
||||
with:
|
||||
@@ -30,11 +38,25 @@ jobs:
|
||||
poetry.lock
|
||||
pyproject.toml
|
||||
|
||||
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
|
||||
id: rust-src
|
||||
with:
|
||||
files: |
|
||||
.github/workflows/_check-codestyle-rust.yml
|
||||
.github/workflows/build-build-tools-image.yml
|
||||
.github/workflows/pre-merge-checks.yml
|
||||
**/**.rs
|
||||
**/Cargo.toml
|
||||
Cargo.toml
|
||||
Cargo.lock
|
||||
|
||||
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
|
||||
env:
|
||||
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
|
||||
RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
|
||||
run: |
|
||||
echo "${PYTHON_CHANGED_FILES}"
|
||||
echo "${RUST_CHANGED_FILES}"
|
||||
|
||||
build-build-tools-image:
|
||||
if: needs.get-changed-files.outputs.python-changed == 'true'
|
||||
@@ -55,6 +77,16 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-rust:
|
||||
if: needs.get-changed-files.outputs.rust-changed == 'true'
|
||||
needs: [ get-changed-files, build-build-tools-image ]
|
||||
uses: ./.github/workflows/_check-codestyle-rust.yml
|
||||
with:
|
||||
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
|
||||
archs: '["x64"]'
|
||||
secrets: inherit
|
||||
|
||||
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
|
||||
# Currently we require 2 jobs (checks with exact name):
|
||||
# - conclusion
|
||||
@@ -67,6 +99,7 @@ jobs:
|
||||
needs:
|
||||
- get-changed-files
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Create fake `neon-cloud-e2e` check
|
||||
|
||||
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1312,6 +1312,7 @@ dependencies = [
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"vm_monitor",
|
||||
"workspace_hack",
|
||||
"zstd",
|
||||
|
||||
@@ -995,24 +995,50 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-pgx-ulid-build"
|
||||
# Compile "pgx_ulid" extension
|
||||
# Compile "pgx_ulid" extension for v16 and below
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-pgx-ulid-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# doesn't support v17 yet
|
||||
# https://github.com/pksunkara/pgx_ulid/pull/52
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16") \
|
||||
;; \
|
||||
*) \
|
||||
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-pgx-ulid-pgrx12-build"
|
||||
# Compile "pgx_ulid" extension for v17 and up
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
;; \
|
||||
*) \
|
||||
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1157,6 +1183,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
@@ -51,6 +51,7 @@ tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
prometheus.workspace = true
|
||||
|
||||
postgres_initdb.workspace = true
|
||||
|
||||
@@ -17,7 +17,8 @@ use crate::{
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub(in crate::http) struct ExtensionServerParams {
|
||||
is_library: Option<bool>,
|
||||
#[serde(default)]
|
||||
is_library: bool,
|
||||
}
|
||||
|
||||
/// Download a remote extension.
|
||||
@@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension(
|
||||
|
||||
remote_extensions.get_ext(
|
||||
&filename,
|
||||
params.is_library.unwrap_or(false),
|
||||
params.is_library,
|
||||
&compute.build_tag,
|
||||
&compute.pgversion,
|
||||
)
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
use std::{
|
||||
net::{IpAddr, Ipv6Addr, SocketAddr},
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
sync::Arc,
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use axum::{
|
||||
extract::Request,
|
||||
middleware::{self, Next},
|
||||
response::{IntoResponse, Response},
|
||||
routing::{get, post},
|
||||
Router,
|
||||
@@ -17,11 +16,9 @@ use axum::{
|
||||
use http::StatusCode;
|
||||
use tokio::net::TcpListener;
|
||||
use tower::ServiceBuilder;
|
||||
use tower_http::{
|
||||
request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
|
||||
trace::TraceLayer,
|
||||
};
|
||||
use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
|
||||
use tracing::{debug, error, info, Span};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
@@ -34,30 +31,24 @@ async fn handle_404() -> Response {
|
||||
StatusCode::NOT_FOUND.into_response()
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct ComputeMakeRequestId(Arc<AtomicU64>);
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
impl MakeRequestId for ComputeMakeRequestId {
|
||||
fn make_request_id<B>(
|
||||
&mut self,
|
||||
_request: &http::Request<B>,
|
||||
) -> Option<tower_http::request_id::RequestId> {
|
||||
let request_id = self
|
||||
.0
|
||||
.fetch_add(1, Ordering::SeqCst)
|
||||
.to_string()
|
||||
.parse()
|
||||
.unwrap();
|
||||
/// This middleware function allows compute_ctl to generate its own request ID
|
||||
/// if one isn't supplied. The control plane will always send one as a UUID. The
|
||||
/// neon Postgres extension on the other hand does not send one.
|
||||
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
|
||||
let headers = request.headers_mut();
|
||||
|
||||
Some(RequestId::new(request_id))
|
||||
if headers.get(X_REQUEST_ID).is_none() {
|
||||
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
|
||||
}
|
||||
|
||||
next.run(request).await
|
||||
}
|
||||
|
||||
/// Run the HTTP server and wait on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
let mut app = Router::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
@@ -82,9 +73,8 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
.fallback(handle_404)
|
||||
.layer(
|
||||
ServiceBuilder::new()
|
||||
.layer(SetRequestIdLayer::x_request_id(
|
||||
ComputeMakeRequestId::default(),
|
||||
))
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
|
||||
@@ -298,14 +298,7 @@ impl FromStr for SkSchedulingPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self(SkSchedulingPolicy::Active)),
|
||||
"disabled" => Ok(Self(SkSchedulingPolicy::Disabled)),
|
||||
"decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)),
|
||||
_ => Err(anyhow::anyhow!(
|
||||
"Unknown scheduling policy '{s}', try active,disabled,decomissioned"
|
||||
)),
|
||||
}
|
||||
SkSchedulingPolicy::from_str(s).map(Self)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -324,7 +324,7 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum SkSchedulingPolicy {
|
||||
Active,
|
||||
Disabled,
|
||||
Pause,
|
||||
Decomissioned,
|
||||
}
|
||||
|
||||
@@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy {
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
Ok(match s {
|
||||
"active" => Self::Active,
|
||||
"disabled" => Self::Disabled,
|
||||
"pause" => Self::Pause,
|
||||
"decomissioned" => Self::Decomissioned,
|
||||
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Unknown scheduling policy '{s}', try active,pause,decomissioned"
|
||||
))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -346,7 +350,7 @@ impl From<SkSchedulingPolicy> for String {
|
||||
use SkSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Disabled => "disabled",
|
||||
Pause => "pause",
|
||||
Decomissioned => "decomissioned",
|
||||
}
|
||||
.to_string()
|
||||
|
||||
@@ -33,7 +33,6 @@ use crate::{
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
@@ -1400,6 +1399,8 @@ pub enum PagestreamFeMessage {
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -1411,6 +1412,22 @@ pub enum PagestreamBeMessage {
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamFeMessageTag {
|
||||
Exists = 0,
|
||||
Nblocks = 1,
|
||||
GetPage = 2,
|
||||
DbSize = 3,
|
||||
GetSlruSegment = 4,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 99,
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
@@ -1422,7 +1439,28 @@ enum PagestreamBeMessageTag {
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
/* future tags above this line */
|
||||
/// For testing purposes, not available in production.
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 199,
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamFeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
0 => Ok(PagestreamFeMessageTag::Exists),
|
||||
1 => Ok(PagestreamFeMessageTag::Nblocks),
|
||||
2 => Ok(PagestreamFeMessageTag::GetPage),
|
||||
3 => Ok(PagestreamFeMessageTag::DbSize),
|
||||
4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
99 => Ok(PagestreamFeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
@@ -1433,6 +1471,8 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
199 => Ok(PagestreamBeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
@@ -1550,6 +1590,20 @@ pub struct PagestreamDbSizeResponse {
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct PagestreamTestRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub batch_key: u64,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamTestResponse {
|
||||
pub req: PagestreamTestRequest,
|
||||
}
|
||||
|
||||
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
|
||||
// that require pageserver-internal types. It is sufficient to get the total size.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -1569,7 +1623,7 @@ impl PagestreamFeMessage {
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(0);
|
||||
bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1580,7 +1634,7 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(1);
|
||||
bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1591,7 +1645,7 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(2);
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1603,7 +1657,7 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(3);
|
||||
bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
@@ -1611,13 +1665,24 @@ impl PagestreamFeMessage {
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(4);
|
||||
bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(req) => {
|
||||
bytes.put_u8(PagestreamFeMessageTag::Test as u8);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.batch_key);
|
||||
let message = req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -1645,56 +1710,66 @@ impl PagestreamFeMessage {
|
||||
),
|
||||
};
|
||||
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
match PagestreamFeMessageTag::try_from(msg_tag)
|
||||
.map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
|
||||
{
|
||||
PagestreamFeMessageTag::Exists => {
|
||||
Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::Nblocks => {
|
||||
Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetPage => {
|
||||
Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::DbSize => {
|
||||
Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
relnode: body.read_u32::<BigEndian>()?,
|
||||
forknum: body.read_u8()?,
|
||||
},
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
}))
|
||||
}
|
||||
PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
@@ -1705,7 +1780,21 @@ impl PagestreamFeMessage {
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key: body.read_u64::<BigEndian>()?,
|
||||
message: {
|
||||
let len = body.read_u64::<BigEndian>()?;
|
||||
let mut buf = vec![0; len as usize];
|
||||
body.read_exact(&mut buf)?;
|
||||
String::from_utf8(buf)?
|
||||
},
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1748,6 +1837,15 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
PagestreamProtocolVersion::V3 => {
|
||||
@@ -1816,6 +1914,18 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1958,6 +2068,28 @@ impl PagestreamBeMessage {
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Tag::Test => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let batch_key = buf.read_u64::<BigEndian>()?;
|
||||
let len = buf.read_u64::<BigEndian>()?;
|
||||
let mut msg = vec![0; len as usize];
|
||||
buf.read_exact(&mut msg)?;
|
||||
let message = String::from_utf8(msg)?;
|
||||
Self::Test(PagestreamTestResponse {
|
||||
req: PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key,
|
||||
message,
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
@@ -1977,6 +2109,8 @@ impl PagestreamBeMessage {
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(_) => "Test",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -277,3 +277,8 @@ pub struct TimelineTermBumpResponse {
|
||||
pub previous_term: u64,
|
||||
pub current_term: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SafekeeperUtilization {
|
||||
pub timeline_count: u64,
|
||||
}
|
||||
|
||||
54
libs/utils/src/guard_arc_swap.rs
Normal file
54
libs/utils/src/guard_arc_swap.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
|
||||
//! don't block reads.
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::TryLockError;
|
||||
|
||||
pub struct GuardArcSwap<T> {
|
||||
inner: ArcSwap<T>,
|
||||
guard: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
pub struct Guard<'a, T> {
|
||||
_guard: tokio::sync::MutexGuard<'a, ()>,
|
||||
inner: &'a ArcSwap<T>,
|
||||
}
|
||||
|
||||
impl<T> GuardArcSwap<T> {
|
||||
pub fn new(inner: T) -> Self {
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(inner)),
|
||||
guard: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn read(&self) -> Arc<T> {
|
||||
self.inner.load_full()
|
||||
}
|
||||
|
||||
pub async fn write_guard(&self) -> Guard<'_, T> {
|
||||
Guard {
|
||||
_guard: self.guard.lock().await,
|
||||
inner: &self.inner,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
|
||||
let guard = self.guard.try_lock()?;
|
||||
Ok(Guard {
|
||||
_guard: guard,
|
||||
inner: &self.inner,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Guard<'_, T> {
|
||||
pub fn read(&self) -> Arc<T> {
|
||||
self.inner.load_full()
|
||||
}
|
||||
|
||||
pub fn write(&mut self, value: T) {
|
||||
self.inner.store(Arc::new(value));
|
||||
}
|
||||
}
|
||||
@@ -98,6 +98,8 @@ pub mod try_rcu;
|
||||
|
||||
pub mod pprof;
|
||||
|
||||
pub mod guard_arc_swap;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -114,3 +114,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
harness = false
|
||||
|
||||
[[bin]]
|
||||
name = "test_helper_slow_client_reads"
|
||||
required-features = [ "testing" ]
|
||||
|
||||
@@ -4,6 +4,9 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = [ "pageserver_api/testing" ]
|
||||
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use futures::SinkExt;
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
@@ -10,7 +13,6 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -62,15 +64,28 @@ impl Client {
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
|
||||
ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
},
|
||||
)));
|
||||
Ok(PagestreamClient {
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
sink: PagestreamSender {
|
||||
shared: shared.clone(),
|
||||
sink,
|
||||
},
|
||||
stream: PagestreamReceiver {
|
||||
shared: shared.clone(),
|
||||
stream,
|
||||
},
|
||||
shared,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -97,7 +112,28 @@ impl Client {
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: PagestreamSender,
|
||||
stream: PagestreamReceiver,
|
||||
}
|
||||
|
||||
pub struct PagestreamSender {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
|
||||
}
|
||||
|
||||
pub struct PagestreamReceiver {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
|
||||
}
|
||||
|
||||
enum PagestreamShared {
|
||||
ConnTaskRunning(ConnTaskRunning),
|
||||
ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
}
|
||||
struct ConnTaskRunning {
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
@@ -110,11 +146,11 @@ pub struct RelTagBlockNo {
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(self) {
|
||||
let Self {
|
||||
copy_both,
|
||||
cancel_on_client_drop: cancel_conn_task,
|
||||
conn_task,
|
||||
} = self;
|
||||
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
shared,
|
||||
sink,
|
||||
stream,
|
||||
} = { self };
|
||||
// The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
|
||||
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
|
||||
//
|
||||
@@ -131,27 +167,77 @@ impl PagestreamClient {
|
||||
//
|
||||
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
|
||||
// => https://github.com/neondatabase/neon/issues/6390
|
||||
let _ = cancel_conn_task.unwrap();
|
||||
let ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
} = {
|
||||
let mut guard = shared.lock().unwrap();
|
||||
match std::mem::replace(
|
||||
&mut *guard,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
) {
|
||||
PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
|
||||
}
|
||||
};
|
||||
let _ = cancel_on_client_drop.unwrap();
|
||||
conn_task.await.unwrap();
|
||||
drop(copy_both);
|
||||
|
||||
// Now drop the split copy_both.
|
||||
drop(sink);
|
||||
drop(stream);
|
||||
}
|
||||
|
||||
pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
|
||||
let Self {
|
||||
shared: _,
|
||||
sink,
|
||||
stream,
|
||||
} = self;
|
||||
(sink, stream)
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
self.getpage_send(req).await?;
|
||||
self.getpage_recv().await
|
||||
}
|
||||
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.sink.getpage_send(req).await
|
||||
}
|
||||
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.stream.getpage_recv().await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamSender {
|
||||
// TODO: maybe make this impl Sink instead for better composability?
|
||||
pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
|
||||
let msg = msg.serialize();
|
||||
self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.send(PagestreamFeMessage::GetPage(req)).await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamReceiver {
|
||||
// TODO: maybe make this impl Stream instead for better composability?
|
||||
pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
PagestreamBeMessage::deserialize(next)
|
||||
}
|
||||
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let next: PagestreamBeMessage = self.recv().await?;
|
||||
match next {
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
@@ -160,7 +246,14 @@ impl PagestreamClient {
|
||||
| PagestreamBeMessage::GetSlruSegment(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
msg.kind()
|
||||
next.kind()
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamBeMessage::Test(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
65
pageserver/src/bin/test_helper_slow_client_reads.rs
Normal file
65
pageserver/src/bin/test_helper_slow_client_reads.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use std::{
|
||||
io::{stdin, stdout, Read, Write},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
connstr: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let Args {
|
||||
connstr,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = Args::parse();
|
||||
let client = pageserver_client::page_service::Client::new(connstr).await?;
|
||||
let client = client.pagestream(tenant_id, timeline_id).await?;
|
||||
let (mut sender, _receiver) = client.split();
|
||||
|
||||
eprintln!("filling the pipe");
|
||||
let mut msg = 0;
|
||||
loop {
|
||||
msg += 1;
|
||||
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
|
||||
PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(23),
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
},
|
||||
));
|
||||
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
|
||||
eprintln!("pipe seems full");
|
||||
break;
|
||||
};
|
||||
let _: () = res?;
|
||||
}
|
||||
|
||||
let n = stdout().write(b"R")?;
|
||||
assert_eq!(n, 1);
|
||||
stdout().flush()?;
|
||||
|
||||
eprintln!("waiting for signal to tell us to exit");
|
||||
|
||||
let mut buf = [0u8; 1];
|
||||
stdin().read_exact(&mut buf)?;
|
||||
|
||||
eprintln!("termination signal received, exiting");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -100,6 +100,32 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_concurrent_initdb",
|
||||
"Number of initdb processes running"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_initdb_semaphore_seconds_global",
|
||||
"Time spent getting a permit from the global initdb semaphore",
|
||||
STORAGE_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define metric")
|
||||
});
|
||||
|
||||
pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_initdb_seconds_global",
|
||||
"Time spent performing initdb",
|
||||
STORAGE_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
#[derive(
|
||||
Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
|
||||
@@ -1463,6 +1489,8 @@ pub enum SmgrQueryType {
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
GetSlruSegment,
|
||||
#[cfg(feature = "testing")]
|
||||
Test,
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
|
||||
@@ -555,37 +555,52 @@ struct BatchedGetPageRequest {
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct BatchedTestRequest {
|
||||
req: models::PagestreamTestRequest,
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
|
||||
/// so that we don't keep the [`Timeline::gate`] open while the batch
|
||||
/// is being built up inside the [`spsc_fold`] (pagestream pipelining).
|
||||
enum BatchedFeMessage {
|
||||
Exists {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamExistsRequest,
|
||||
},
|
||||
Nblocks {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamNblocksRequest,
|
||||
},
|
||||
GetPage {
|
||||
span: Span,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
},
|
||||
DbSize {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamDbSizeRequest,
|
||||
},
|
||||
GetSlruSegment {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamGetSlruSegmentRequest,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
span: Span,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
},
|
||||
RespondError {
|
||||
span: Span,
|
||||
error: BatchedPageStreamError,
|
||||
@@ -606,6 +621,12 @@ impl BatchedFeMessage {
|
||||
page.timer.observe_execution_start(at);
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test { requests, .. } => {
|
||||
for req in requests {
|
||||
req.timer.observe_execution_start(at);
|
||||
}
|
||||
}
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
}
|
||||
}
|
||||
@@ -735,7 +756,7 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
shard: shard.downgrade(),
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -754,7 +775,7 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::Nblocks {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
shard: shard.downgrade(),
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -773,7 +794,7 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::DbSize {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
shard: shard.downgrade(),
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -792,7 +813,7 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
shard: shard.downgrade(),
|
||||
req,
|
||||
}
|
||||
}
|
||||
@@ -844,6 +865,7 @@ impl PageServerHandler {
|
||||
)
|
||||
.await?;
|
||||
|
||||
// We're holding the Handle
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
req.hdr.request_lsn,
|
||||
@@ -861,11 +883,27 @@ impl PageServerHandler {
|
||||
};
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
shard: shard.downgrade(),
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessage::Test(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer =
|
||||
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
|
||||
.await?;
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
requests: vec![BatchedTestRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Some(batched_msg))
|
||||
}
|
||||
@@ -907,9 +945,7 @@ impl PageServerHandler {
|
||||
assert_eq!(accum_pages.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
|
||||
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
|
||||
{
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
@@ -928,6 +964,44 @@ impl PageServerHandler {
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
Ok(BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
}),
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) if (|| {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
@@ -969,7 +1043,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_rel_exists_request(&shard, &req, ctx)
|
||||
.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -986,7 +1060,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_nblocks_request(&shard, &req, ctx)
|
||||
.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1007,7 +1081,7 @@ impl PageServerHandler {
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&shard,
|
||||
&*shard.upgrade()?,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
ctx,
|
||||
@@ -1029,7 +1103,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
(
|
||||
vec![self
|
||||
.handle_db_size_request(&shard, &req, ctx)
|
||||
.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1046,7 +1120,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_slru_segment_request(&shard, &req, ctx)
|
||||
.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1054,6 +1128,27 @@ impl PageServerHandler {
|
||||
span,
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard,
|
||||
requests,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::test");
|
||||
(
|
||||
{
|
||||
let npages = requests.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::RespondError { span, error } => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
@@ -1791,6 +1886,51 @@ impl PageServerHandler {
|
||||
))
|
||||
}
|
||||
|
||||
// NB: this impl mimics what we do for batched getpage requests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_test_request_batch(
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
// real requests would do something with the timeline
|
||||
let mut results = Vec::with_capacity(requests.len());
|
||||
for _req in requests.iter() {
|
||||
tokio::task::yield_now().await;
|
||||
|
||||
results.push({
|
||||
if timeline.cancel.is_cancelled() {
|
||||
Err(PageReconstructError::Cancelled)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: avoid creating the new Vec here
|
||||
Vec::from_iter(
|
||||
requests
|
||||
.into_iter()
|
||||
.zip(results.into_iter())
|
||||
.map(|(req, res)| {
|
||||
res.map(|()| {
|
||||
(
|
||||
PagestreamBeMessage::Test(models::PagestreamTestResponse {
|
||||
req: req.req.clone(),
|
||||
}),
|
||||
req.timer,
|
||||
)
|
||||
})
|
||||
.map_err(|e| BatchedPageStreamError {
|
||||
err: PageStreamError::from(e),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Note on "fullbackup":
|
||||
/// Full basebackups should only be used for debugging purposes.
|
||||
/// Originally, it was introduced to enable breaking storage format changes,
|
||||
@@ -2406,6 +2546,14 @@ impl From<GetActiveTimelineError> for QueryError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::timeline::handle::HandleUpgradeError> for QueryError {
|
||||
fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self {
|
||||
match e {
|
||||
crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn set_tracing_field_shard_id(timeline: &Timeline) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
tracing::Span::current().record(
|
||||
|
||||
@@ -95,6 +95,9 @@ use crate::deletion_queue::DeletionQueueError;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::l0_flush::L0FlushGlobalState;
|
||||
use crate::metrics::CONCURRENT_INITDBS;
|
||||
use crate::metrics::INITDB_RUN_TIME;
|
||||
use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME;
|
||||
use crate::metrics::TENANT;
|
||||
use crate::metrics::{
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
|
||||
@@ -5347,8 +5350,17 @@ async fn run_initdb(
|
||||
initdb_bin_path, initdb_target_dir, initdb_lib_dir,
|
||||
);
|
||||
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
let _permit = {
|
||||
let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
|
||||
INIT_DB_SEMAPHORE.acquire().await
|
||||
};
|
||||
|
||||
CONCURRENT_INITDBS.inc();
|
||||
scopeguard::defer! {
|
||||
CONCURRENT_INITDBS.dec();
|
||||
}
|
||||
|
||||
let _timer = INITDB_RUN_TIME.start_timer();
|
||||
let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
|
||||
superuser: &conf.superuser,
|
||||
locale: &conf.locale,
|
||||
|
||||
@@ -382,6 +382,12 @@ pub(crate) struct RemoteTimelineClient {
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClient {
|
||||
fn drop(&mut self) {
|
||||
debug!("dropping RemoteTimelineClient");
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineClient {
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
@@ -797,6 +803,12 @@ impl RemoteTimelineClient {
|
||||
|
||||
upload_queue.dirty.metadata.apply(update);
|
||||
|
||||
// Defense in depth: if we somehow generated invalid metadata, do not persist it.
|
||||
upload_queue
|
||||
.dirty
|
||||
.validate()
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
self.schedule_index_upload(upload_queue);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -152,6 +152,21 @@ impl IndexPart {
|
||||
};
|
||||
is_same_remote_layer_path(name, metadata, name, index_metadata)
|
||||
}
|
||||
|
||||
/// Check for invariants in the index: this is useful when uploading an index to ensure that if
|
||||
/// we encounter a bug, we do not persist buggy metadata.
|
||||
pub(crate) fn validate(&self) -> Result<(), String> {
|
||||
if self.import_pgdata.is_none()
|
||||
&& self.metadata.ancestor_timeline().is_none()
|
||||
&& self.layer_metadata.is_empty()
|
||||
{
|
||||
// Unless we're in the middle of a raw pgdata import, or this is a child timeline,the index must
|
||||
// always have at least one layer.
|
||||
return Err("Index has no ancestor and no layers".to_string());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
|
||||
@@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part(
|
||||
});
|
||||
pausable_failpoint!("before-upload-index-pausable");
|
||||
|
||||
// Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this
|
||||
// (this should never happen)
|
||||
index_part.validate().map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
// FIXME: this error comes too late
|
||||
let serialized = index_part.to_json_bytes()?;
|
||||
let serialized = Bytes::from(serialized);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use pageserver_api::key::CONTROLFILE_KEY;
|
||||
use pageserver_api::key::{Key, CONTROLFILE_KEY};
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{
|
||||
completion::{self, Completion},
|
||||
@@ -9,7 +9,10 @@ use utils::{
|
||||
|
||||
use super::failpoints::{Failpoint, FailpointKind};
|
||||
use super::*;
|
||||
use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
tenant::{harness::test_img, storage_layer::LayerVisibilityHint},
|
||||
};
|
||||
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
|
||||
|
||||
/// Used in tests to advance a future to wanted await point, and not futher.
|
||||
@@ -31,20 +34,51 @@ async fn smoke_test() {
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
|
||||
let image_layers = vec![(
|
||||
Lsn(0x40),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
test_img("foo"),
|
||||
)],
|
||||
)];
|
||||
|
||||
// Create a test timeline with one real layer, and one synthetic test layer. The synthetic
|
||||
// one is only there so that we can GC the real one without leaving the timeline's metadata
|
||||
// empty, which is an illegal state (see [`IndexPart::validate`]).
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Default::default(),
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
// Grab one of the timeline's layers to exercise in the test, and the other layer that is just
|
||||
// there to avoid the timeline being illegally empty
|
||||
let (layer, dummy_layer) = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
assert_eq!(layers.len(), 2);
|
||||
|
||||
layers.swap_remove(0)
|
||||
layers.sort_by_key(|l| l.layer_desc().get_key_range().start);
|
||||
let synthetic_layer = layers.pop().unwrap();
|
||||
let real_layer = layers.pop().unwrap();
|
||||
tracing::info!(
|
||||
"real_layer={:?} ({}), synthetic_layer={:?} ({})",
|
||||
real_layer,
|
||||
real_layer.layer_desc().file_size,
|
||||
synthetic_layer,
|
||||
synthetic_layer.layer_desc().file_size
|
||||
);
|
||||
(real_layer, synthetic_layer)
|
||||
};
|
||||
|
||||
// all layers created at pageserver are like `layer`, initialized with strong
|
||||
@@ -173,10 +207,13 @@ async fn smoke_test() {
|
||||
|
||||
let rtc = &timeline.remote_client;
|
||||
|
||||
// Simulate GC removing our test layer.
|
||||
{
|
||||
let layers = &[layer];
|
||||
let mut g = timeline.layers.write().await;
|
||||
|
||||
let layers = &[layer];
|
||||
g.open_mut().unwrap().finish_gc_timeline(layers);
|
||||
|
||||
// this just updates the remote_physical_size for demonstration purposes
|
||||
rtc.schedule_gc_update(layers).unwrap();
|
||||
}
|
||||
@@ -191,7 +228,10 @@ async fn smoke_test() {
|
||||
|
||||
rtc.wait_completion().await.unwrap();
|
||||
|
||||
assert_eq!(rtc.get_remote_physical_size(), 0);
|
||||
assert_eq!(
|
||||
rtc.get_remote_physical_size(),
|
||||
dummy_layer.metadata().file_size
|
||||
);
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
|
||||
@@ -51,7 +51,9 @@ use tokio::{
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
fs_ext, pausable_failpoint,
|
||||
fs_ext,
|
||||
guard_arc_swap::GuardArcSwap,
|
||||
pausable_failpoint,
|
||||
postgres_client::PostgresClientProtocol,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
@@ -74,6 +76,7 @@ use std::{pin::pin, sync::OnceLock};
|
||||
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
page_service::TenantManagerTypes,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -353,8 +356,8 @@ pub struct Timeline {
|
||||
// though let's keep them both for better error visibility.
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// When did we last calculate the partitioning? Make it pub to test cases.
|
||||
pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
/// The repartitioning result. Allows a single writer and multiple readers.
|
||||
pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -429,7 +432,7 @@ pub struct Timeline {
|
||||
|
||||
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||
pub(crate) handles: handle::PerTimelineState<TenantManagerTypes>,
|
||||
|
||||
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||
|
||||
@@ -2340,7 +2343,8 @@ impl Timeline {
|
||||
// initial logical size is 0.
|
||||
LogicalSize::empty_initial()
|
||||
},
|
||||
partitioning: tokio::sync::Mutex::new((
|
||||
|
||||
partitioning: GuardArcSwap::new((
|
||||
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
|
||||
Lsn(0),
|
||||
)),
|
||||
@@ -4028,18 +4032,15 @@ impl Timeline {
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
|
||||
let Ok(mut guard) = self.partitioning.try_write_guard() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
// Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for
|
||||
// gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own
|
||||
// heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called concurrently, this is rare and a retry should be fine"
|
||||
"repartition() called concurrently"
|
||||
)));
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
|
||||
if lsn < *partition_lsn {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
@@ -4067,9 +4068,9 @@ impl Timeline {
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
}; // no partitioning for metadata keys for now
|
||||
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
|
||||
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
|
||||
let result = ((dense_partitioning, sparse_partitioning), lsn);
|
||||
guard.write(result.clone());
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
@@ -4625,6 +4626,10 @@ impl Drop for Timeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Timeline {} for tenant {} is being dropped",
|
||||
self.timeline_id, self.tenant_shard_id.tenant_id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5673,9 +5678,17 @@ impl Timeline {
|
||||
info!("force created image layer {}", image_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.open_mut().unwrap().force_insert_layer(image_layer);
|
||||
guard
|
||||
.open_mut()
|
||||
.unwrap()
|
||||
.force_insert_layer(image_layer.clone());
|
||||
}
|
||||
|
||||
// Update remote_timeline_client state to reflect existence of this layer
|
||||
self.remote_client
|
||||
.schedule_layer_file_upload(image_layer)
|
||||
.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5726,9 +5739,17 @@ impl Timeline {
|
||||
info!("force created delta layer {}", delta_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.open_mut().unwrap().force_insert_layer(delta_layer);
|
||||
guard
|
||||
.open_mut()
|
||||
.unwrap()
|
||||
.force_insert_layer(delta_layer.clone());
|
||||
}
|
||||
|
||||
// Update remote_timeline_client state to reflect existence of this layer
|
||||
self.remote_client
|
||||
.schedule_layer_file_upload(delta_layer)
|
||||
.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -2146,12 +2146,7 @@ impl Timeline {
|
||||
let mut compact_jobs = Vec::new();
|
||||
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
|
||||
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
|
||||
let ((dense_ks, sparse_ks), _) = {
|
||||
let Ok(partition) = self.partitioning.try_lock() else {
|
||||
bail!("failed to acquire partition lock during gc-compaction");
|
||||
};
|
||||
partition.clone()
|
||||
};
|
||||
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
|
||||
// Truncate the key range to be within user specified compaction range.
|
||||
fn truncate_to(
|
||||
source_start: &Key,
|
||||
|
||||
@@ -32,54 +32,151 @@
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! ## Data Structures
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
|
||||
//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
|
||||
//!
|
||||
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
|
||||
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
|
||||
//!
|
||||
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||
//! The `HandleInner` is allocated as a `Arc<Mutex<HandleInner>>` and
|
||||
//! referenced weakly and strongly from various places which we are now illustrating.
|
||||
//! For brevity, we will omit the `Arc<Mutex<>>` part in the following and instead
|
||||
//! use `strong ref` and `weak ref` when referring to the `Arc<Mutex<HandleInner>>`
|
||||
//! or `Weak<Mutex<HandleInner>>`, respectively.
|
||||
//!
|
||||
//! - The `Handle` is a strong ref.
|
||||
//! - The `WeakHandle` is a weak ref.
|
||||
//! - The `PerTimelineState` contains a `HashMap<CacheId, strong ref>`.
|
||||
//! - The `Cache` is a `HashMap<unique identifier for the shard, weak ref>`.
|
||||
//!
|
||||
//! Lifetimes:
|
||||
//! - `WeakHandle` and `Handle`: single pagestream request.
|
||||
//! - `Cache`: single page service connection.
|
||||
//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`).
|
||||
//!
|
||||
//! ## Request Handling Flow (= filling and using the `Cache``)
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
|
||||
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
|
||||
//! and a strong ref in the `PerTimelineState`.
|
||||
//! A strong ref is returned wrapped in a `Handle`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the `Weak<HandleInner>` in the cache.
|
||||
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||
//! and find the weak ref in the cache.
|
||||
//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`.
|
||||
//!
|
||||
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! The pagestream processing is pipelined and involves a batching step.
|
||||
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
|
||||
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
|
||||
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//!
|
||||
//! # Memory Management / How The Reference Cycle Is Broken
|
||||
//! # Performance
|
||||
//!
|
||||
//! The attentive reader may have noticed the strong reference cycle
|
||||
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||
//! Remember from the introductory section:
|
||||
//!
|
||||
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||
//! > However, we want to avoid the overhead of entering the gate for every
|
||||
//! > method invocation.
|
||||
//!
|
||||
//! Why do we want to avoid that?
|
||||
//! Because the gate is a shared location in memory and entering it involves
|
||||
//! bumping refcounts, which leads to cache contention if done frequently
|
||||
//! from multiple cores in parallel.
|
||||
//!
|
||||
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
|
||||
//! That `Arc` is private to the `HandleInner` and hence to the connection.
|
||||
//! (Review the "Data Structures" section if that is unclear to you.)
|
||||
//!
|
||||
//! A `WeakHandle` is a weak ref to the `HandleInner`.
|
||||
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
|
||||
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
|
||||
//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
|
||||
//!
|
||||
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
|
||||
//! Again, this is cheap because the `Arc` is private to the connection.
|
||||
//!
|
||||
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
|
||||
//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
|
||||
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
|
||||
//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
|
||||
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
|
||||
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
|
||||
//!
|
||||
//! # Shutdown
|
||||
//!
|
||||
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! Further, there is this cycle:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! The former cycle is a memory leak if not broken.
|
||||
//! The latter cycle further prevents the Timeline from shutting down
|
||||
//! because we certainly won't drop the Timeline while the GateGuard is alive.
|
||||
//! Preventing shutdown is the whole point of this handle/cache system,
|
||||
//! but when the Timeline needs to shut down, we need to break the cycle.
|
||||
//!
|
||||
//! The cycle is broken by either
|
||||
//! - `PerTimelineState::shutdown` or
|
||||
//! - dropping the `Cache`.
|
||||
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
|
||||
//! - Connection shutdown (=> dropping the `Cache`).
|
||||
//!
|
||||
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
|
||||
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
|
||||
//! `Arc<GateGuard>`.
|
||||
//!
|
||||
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
|
||||
//! thereby breaking the cycle.
|
||||
//! It also initiates draining of already existing `Handle`s by
|
||||
//! poisoning things so that no new `HandleInner`'s can be added
|
||||
//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail.
|
||||
//!
|
||||
//! Concurrently existing / already upgraded `Handle`s will extend the
|
||||
//! lifetime of the `Arc<Mutex<HandleInner>>` and hence cycles.
|
||||
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||
//! that extension of the cycle is bounded.
|
||||
//! handed out from `Cache::get` or `WeakHandle::upgrade` after
|
||||
//! `PerTimelineState::shutdown`, that extension of the cycle is bounded.
|
||||
//!
|
||||
//! Concurrently existing `WeakHandle`s will fail to `upgrade()`:
|
||||
//! while they will succeed in upgrading `Weak<Mutex<HandleInner>>`,
|
||||
//! they will find the inner in state `HandleInner::ShutDown` state where the
|
||||
//! `Arc<GateGuard>` and Timeline has already been dropped.
|
||||
//!
|
||||
//! Dropping the `Cache` undoes the registration of this `Cache`'s
|
||||
//! `HandleInner`s from all the `PerTimelineState`s, i.e., it
|
||||
//! removes the strong ref to each of its `HandleInner`s
|
||||
//! from all the `PerTimelineState`.
|
||||
//!
|
||||
//! # Locking Rules
|
||||
//!
|
||||
//! To prevent deadlocks we:
|
||||
//!
|
||||
//! 1. Only ever hold one of the locks at a time.
|
||||
//! 2. Don't add more than one Drop impl that locks on the
|
||||
//! cycles above.
|
||||
//!
|
||||
//! As per (2), that impl is in `Drop for Cache`.
|
||||
//!
|
||||
//! # Fast Path for Shard Routing
|
||||
//!
|
||||
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||
//! the tenant manager for every request.
|
||||
//!
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s.
|
||||
//!
|
||||
//! The current implementation uses the first entry in the hash map
|
||||
//! to determine the `ShardParameters` and derive the correct
|
||||
@@ -87,18 +184,18 @@
|
||||
//!
|
||||
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||
//!
|
||||
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||
//! If the lookup is successful and the `WeakHandle` can be upgraded,
|
||||
//! it's a hit.
|
||||
//!
|
||||
//! ## Cache invalidation
|
||||
//!
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently if done lazily.
|
||||
//! The only reasons why an entry in the cache can become stale are:
|
||||
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||
//!
|
||||
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||
//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the
|
||||
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||
//!
|
||||
//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
|
||||
@@ -107,8 +204,6 @@
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
@@ -152,7 +247,7 @@ pub(crate) struct Cache<T: Types> {
|
||||
map: Map<T>,
|
||||
}
|
||||
|
||||
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||
type Map<T> = HashMap<ShardTimelineId, WeakHandle<T>>;
|
||||
|
||||
impl<T: Types> Default for Cache<T> {
|
||||
fn default() -> Self {
|
||||
@@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId {
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||
struct HandleInner<T: Types> {
|
||||
shut_down: AtomicBool,
|
||||
timeline: T::Timeline,
|
||||
// The timeline's gate held open.
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
pub(crate) struct Handle<T: Types> {
|
||||
timeline: Arc<T::Timeline>,
|
||||
#[allow(dead_code)] // the field exists to keep the gate open
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
inner: Arc<Mutex<HandleInner<T>>>,
|
||||
}
|
||||
pub(crate) struct WeakHandle<T: Types> {
|
||||
inner: Weak<Mutex<HandleInner<T>>>,
|
||||
}
|
||||
enum HandleInner<T: Types> {
|
||||
KeepingTimelineGateOpen {
|
||||
#[allow(dead_code)]
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
timeline: Arc<T::Timeline>,
|
||||
},
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||
@@ -183,7 +288,8 @@ struct HandleInner<T: Types> {
|
||||
/// See module-level comment for details.
|
||||
pub struct PerTimelineState<T: Types> {
|
||||
// None = shutting down
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||
#[allow(clippy::type_complexity)]
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<Mutex<HandleInner<T>>>>>>,
|
||||
}
|
||||
|
||||
impl<T: Types> Default for PerTimelineState<T> {
|
||||
@@ -243,49 +349,24 @@ impl<T: Types> Cache<T> {
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
// terminates because each iteration removes an element from the map
|
||||
loop {
|
||||
let handle = self
|
||||
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||
.await?;
|
||||
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||
let removed = self
|
||||
.map
|
||||
.remove(&handle.0.timeline.shard_timeline_id())
|
||||
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||
assert!(
|
||||
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||
"shard_timeline_id() incorrect?"
|
||||
);
|
||||
} else {
|
||||
return Ok(handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
async fn get_impl(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
let miss: ShardSelector = {
|
||||
// terminates because when every iteration we remove an element from the map
|
||||
let miss: ShardSelector = loop {
|
||||
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||
match routing_state {
|
||||
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||
Some(cached) => match cached.upgrade() {
|
||||
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||
None => {
|
||||
Ok(upgraded) => return Ok(upgraded),
|
||||
Err(HandleUpgradeError::ShutDown) => {
|
||||
// TODO: dedup with shard_routing()
|
||||
trace!("handle cache stale");
|
||||
self.map.remove(&key).unwrap();
|
||||
ShardSelector::Known(key.shard_index)
|
||||
continue;
|
||||
}
|
||||
},
|
||||
None => ShardSelector::Known(key.shard_index),
|
||||
None => break ShardSelector::Known(key.shard_index),
|
||||
},
|
||||
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||
RoutingResult::NeedConsultTenantManager => break shard_selector,
|
||||
}
|
||||
};
|
||||
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||
@@ -302,7 +383,7 @@ impl<T: Types> Cache<T> {
|
||||
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||
return RoutingResult::NeedConsultTenantManager;
|
||||
};
|
||||
let Some(first_handle) = first_handle.upgrade() else {
|
||||
let Ok(first_handle) = first_handle.upgrade() else {
|
||||
// TODO: dedup with get()
|
||||
trace!("handle cache stale");
|
||||
let first_key_owned = *first_key;
|
||||
@@ -310,7 +391,7 @@ impl<T: Types> Cache<T> {
|
||||
continue;
|
||||
};
|
||||
|
||||
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||
let first_handle_shard_identity = first_handle.get_shard_identity();
|
||||
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||
shard_number: shard_num,
|
||||
shard_count: first_handle_shard_identity.count,
|
||||
@@ -329,11 +410,11 @@ impl<T: Types> Cache<T> {
|
||||
};
|
||||
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: first_handle_shard_identity.shard_index(),
|
||||
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||
timeline_id: first_handle.shard_timeline_id().timeline_id,
|
||||
};
|
||||
|
||||
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||
return RoutingResult::FastPath(Handle(first_handle));
|
||||
return RoutingResult::FastPath(first_handle);
|
||||
} else {
|
||||
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||
}
|
||||
@@ -357,23 +438,30 @@ impl<T: Types> Cache<T> {
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
let gate_guard = match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
};
|
||||
trace!("creating new HandleInner");
|
||||
let handle = Arc::new(
|
||||
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||
// so we can identify reference cycle bugs.
|
||||
HandleInner {
|
||||
shut_down: AtomicBool::new(false),
|
||||
_gate_guard: gate_guard,
|
||||
timeline: timeline.clone(),
|
||||
},
|
||||
);
|
||||
let handle = {
|
||||
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
|
||||
gate_guard: Arc::new(
|
||||
// this enter() is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::gate refcounts
|
||||
match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
},
|
||||
),
|
||||
// this clone is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::clone refcounts
|
||||
timeline: Arc::new(timeline.clone()),
|
||||
}));
|
||||
let handle_weak = WeakHandle {
|
||||
inner: Arc::downgrade(&handle_inner_arc),
|
||||
};
|
||||
let handle = handle_weak
|
||||
.upgrade()
|
||||
.ok()
|
||||
.expect("we just created it and it's not linked anywhere yet");
|
||||
{
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
@@ -381,7 +469,8 @@ impl<T: Types> Cache<T> {
|
||||
.expect("mutex poisoned");
|
||||
match &mut *lock_guard {
|
||||
Some(per_timeline_state) => {
|
||||
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||
let replaced =
|
||||
per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
@@ -392,8 +481,7 @@ impl<T: Types> Cache<T> {
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Arc::downgrade(&handle));
|
||||
handle
|
||||
v.insert(handle_weak);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -401,14 +489,62 @@ impl<T: Types> Cache<T> {
|
||||
return Err(GetError::PerTimelineStateShutDown);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Handle(handle))
|
||||
}
|
||||
Ok(handle)
|
||||
}
|
||||
Err(e) => Err(GetError::TenantManager(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum HandleUpgradeError {
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
impl<T: Types> WeakHandle<T> {
|
||||
pub(crate) fn upgrade(&self) -> Result<Handle<T>, HandleUpgradeError> {
|
||||
let Some(inner) = Weak::upgrade(&self.inner) else {
|
||||
return Err(HandleUpgradeError::ShutDown);
|
||||
};
|
||||
let lock_guard = inner.lock().expect("poisoned");
|
||||
match &*lock_guard {
|
||||
HandleInner::KeepingTimelineGateOpen {
|
||||
timeline,
|
||||
gate_guard,
|
||||
} => {
|
||||
let gate_guard = Arc::clone(gate_guard);
|
||||
let timeline = Arc::clone(timeline);
|
||||
drop(lock_guard);
|
||||
Ok(Handle {
|
||||
timeline,
|
||||
gate_guard,
|
||||
inner,
|
||||
})
|
||||
}
|
||||
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_same_handle_as(&self, other: &WeakHandle<T>) -> bool {
|
||||
Weak::ptr_eq(&self.inner, &other.inner)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> Handle<T> {
|
||||
pub(crate) fn downgrade(&self) -> WeakHandle<T> {
|
||||
WeakHandle {
|
||||
inner: Arc::downgrade(&self.inner),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> PerTimelineState<T> {
|
||||
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
@@ -430,43 +566,54 @@ impl<T: Types> PerTimelineState<T> {
|
||||
trace!("already shut down");
|
||||
return;
|
||||
};
|
||||
for handle in handles.values() {
|
||||
for handle_inner_arc in handles.values() {
|
||||
// Make hits fail.
|
||||
handle.shut_down.store(true, Ordering::Relaxed);
|
||||
let mut lock_guard = handle_inner_arc.lock().expect("poisoned");
|
||||
lock_guard.shutdown();
|
||||
}
|
||||
drop(handles);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0.timeline
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<T: Types> Drop for HandleInner<T> {
|
||||
fn drop(&mut self) {
|
||||
trace!("HandleInner dropped");
|
||||
}
|
||||
}
|
||||
|
||||
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
|
||||
impl<T: Types> Drop for Cache<T> {
|
||||
fn drop(&mut self) {
|
||||
for (_, weak) in self.map.drain() {
|
||||
if let Some(strong) = weak.upgrade() {
|
||||
// handle is still being kept alive in PerTimelineState
|
||||
let timeline = strong.timeline.per_timeline_state();
|
||||
let mut handles = timeline.handles.lock().expect("mutex poisoned");
|
||||
if let Some(handles) = &mut *handles {
|
||||
let Some(removed) = handles.remove(&self.id) else {
|
||||
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
|
||||
continue;
|
||||
};
|
||||
assert!(Arc::ptr_eq(&removed, &strong));
|
||||
}
|
||||
for (
|
||||
_,
|
||||
WeakHandle {
|
||||
inner: handle_inner_weak,
|
||||
},
|
||||
) in self.map.drain()
|
||||
{
|
||||
let Some(handle_inner_arc) = handle_inner_weak.upgrade() else {
|
||||
continue;
|
||||
};
|
||||
let handle_timeline = handle_inner_arc
|
||||
// locking rules: drop lock before acquiring other lock below
|
||||
.lock()
|
||||
.expect("poisoned")
|
||||
.shutdown();
|
||||
let per_timeline_state = handle_timeline.per_timeline_state();
|
||||
let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned");
|
||||
let Some(handles) = &mut *handles_lock_guard else {
|
||||
continue;
|
||||
};
|
||||
let Some(removed_handle_inner_arc) = handles.remove(&self.id) else {
|
||||
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
|
||||
continue;
|
||||
};
|
||||
drop(handles_lock_guard); // locking rules: remember them when!
|
||||
assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> HandleInner<T> {
|
||||
fn shutdown(&mut self) -> Arc<T::Timeline> {
|
||||
match std::mem::replace(self, HandleInner::ShutDown) {
|
||||
HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline,
|
||||
HandleInner::ShutDown => {
|
||||
unreachable!("handles are only shut down once in their lifetime");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -474,6 +621,8 @@ impl<T: Types> Drop for Cache<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Weak;
|
||||
|
||||
use pageserver_api::{
|
||||
key::{rel_block_to_key, Key, DBDIR_KEY},
|
||||
models::ShardParameters,
|
||||
@@ -583,39 +732,13 @@ mod tests {
|
||||
//
|
||||
// fill the cache
|
||||
//
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
let handle: Handle<_> = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
let handle_inner_weak = Arc::downgrade(&handle.0);
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
assert_eq!(
|
||||
(
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
Weak::weak_count(&handle_inner_weak)
|
||||
),
|
||||
(2, 2),
|
||||
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
|
||||
);
|
||||
assert_eq!(cache.map.len(), 1);
|
||||
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
//
|
||||
// demonstrate that Handle holds up gate closure
|
||||
@@ -640,21 +763,11 @@ mod tests {
|
||||
// SHUTDOWN
|
||||
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"through local var handle"
|
||||
);
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
1,
|
||||
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(via handle), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// this handle is perfectly usable
|
||||
handle.getpage();
|
||||
@@ -678,16 +791,6 @@ mod tests {
|
||||
}
|
||||
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
0,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"the HandleInner destructor already ran"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// closing gate succeeds after dropping handle
|
||||
tokio::select! {
|
||||
@@ -706,10 +809,8 @@ mod tests {
|
||||
assert_eq!(cache.map.len(), 0);
|
||||
|
||||
// ensure all refs to shard0 are gone and we're not leaking anything
|
||||
let myself = Weak::clone(&shard0.myself);
|
||||
drop(shard0);
|
||||
drop(mgr);
|
||||
assert_eq!(Weak::strong_count(&myself), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -948,15 +1049,11 @@ mod tests {
|
||||
handle
|
||||
};
|
||||
handle.getpage();
|
||||
used_handles.push(Arc::downgrade(&handle.0));
|
||||
used_handles.push(Arc::downgrade(&handle.timeline));
|
||||
}
|
||||
|
||||
// No handles exist, thus gates are closed and don't require shutdown
|
||||
assert!(used_handles
|
||||
.iter()
|
||||
.all(|weak| Weak::strong_count(weak) == 0));
|
||||
|
||||
// ... thus the gate should close immediately, even without shutdown
|
||||
// No handles exist, thus gates are closed and don't require shutdown.
|
||||
// Thus the gate should close immediately, even without shutdown.
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
@@ -964,4 +1061,75 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_weak_handles() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mgr = StubManager {
|
||||
shards: vec![shard0.clone()],
|
||||
};
|
||||
|
||||
let refcount_start = Arc::strong_count(&shard0);
|
||||
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
let handle = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
|
||||
let weak_handle = handle.downgrade();
|
||||
|
||||
drop(handle);
|
||||
|
||||
let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it");
|
||||
|
||||
// Start shutdown
|
||||
shard0.per_timeline_state.shutdown();
|
||||
|
||||
// Upgrades during shutdown don't work, even if upgraded_handle exists.
|
||||
weak_handle
|
||||
.upgrade()
|
||||
.err()
|
||||
.expect("can't upgrade weak handle as soon as shutdown started");
|
||||
|
||||
// But upgraded_handle is still alive, so the gate won't close.
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => {
|
||||
panic!("handle is keeping gate open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => { }
|
||||
}
|
||||
|
||||
// Drop the last handle.
|
||||
drop(upgraded_handle);
|
||||
|
||||
// The gate should close now, despite there still being a weak_handle.
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("only strong handle is dropped and we shut down per-timeline-state")
|
||||
}
|
||||
}
|
||||
|
||||
// The weak handle still can't be upgraded.
|
||||
weak_handle
|
||||
.upgrade()
|
||||
.err()
|
||||
.expect("still shouldn't be able to upgrade the weak handle");
|
||||
|
||||
// There should be no strong references to the timeline object except the one on "stack".
|
||||
assert_eq!(Arc::strong_count(&shard0), refcount_start);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let (replication_client, connection) = {
|
||||
let mut config = wal_source_connconf.to_tokio_postgres_config();
|
||||
config.application_name("pageserver");
|
||||
config.application_name(format!("pageserver-{}", node.0).as_str());
|
||||
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
|
||||
Ok(client_and_conn) => client_and_conn?,
|
||||
|
||||
@@ -34,6 +34,8 @@ typedef enum
|
||||
T_NeonGetPageRequest,
|
||||
T_NeonDbSizeRequest,
|
||||
T_NeonGetSlruSegmentRequest,
|
||||
/* future tags above this line */
|
||||
T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */
|
||||
|
||||
/* pagestore -> pagestore_client */
|
||||
T_NeonExistsResponse = 100,
|
||||
@@ -42,6 +44,8 @@ typedef enum
|
||||
T_NeonErrorResponse,
|
||||
T_NeonDbSizeResponse,
|
||||
T_NeonGetSlruSegmentResponse,
|
||||
/* future tags above this line */
|
||||
T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */
|
||||
} NeonMessageTag;
|
||||
|
||||
typedef uint64 NeonRequestId;
|
||||
|
||||
@@ -14,7 +14,7 @@ use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP};
|
||||
use proxy::auth::{self};
|
||||
use proxy::cancellation::CancellationHandlerMain;
|
||||
use proxy::config::{
|
||||
self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
|
||||
self, obfuscated_proxy_id, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig
|
||||
};
|
||||
use proxy::control_plane::locks::ApiLocks;
|
||||
use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings};
|
||||
@@ -218,6 +218,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
proxy::metrics::CancellationSource::Local,
|
||||
)),
|
||||
endpoint_rate_limiter,
|
||||
obfuscated_proxy_id(std::process::id(), "local"),
|
||||
);
|
||||
|
||||
match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
|
||||
|
||||
@@ -10,6 +10,7 @@ use clap::Arg;
|
||||
use futures::future::Either;
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use proxy::config::obfuscated_proxy_id;
|
||||
use proxy::context::RequestContext;
|
||||
use proxy::metrics::{Metrics, ThreadPoolMetrics};
|
||||
use proxy::protocol2::ConnectionInfo;
|
||||
@@ -185,6 +186,7 @@ async fn task_main(
|
||||
},
|
||||
proxy::metrics::Protocol::SniRouter,
|
||||
"sni",
|
||||
obfuscated_proxy_id(std::process::id(), "sni-router"), // just a shim for context
|
||||
);
|
||||
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
|
||||
use proxy::cancellation::{CancelMap, CancellationHandler};
|
||||
use proxy::config::{
|
||||
self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig,
|
||||
ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2,
|
||||
ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, obfuscated_proxy_id,
|
||||
};
|
||||
use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
@@ -396,6 +396,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let proxy_id: u16 = obfuscated_proxy_id(std::process::id(), &args.region);
|
||||
|
||||
let cancellation_handler = Arc::new(CancellationHandler::<
|
||||
Option<Arc<Mutex<RedisPublisherClient>>>,
|
||||
>::new(
|
||||
@@ -437,6 +439,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
cancellation_token.clone(),
|
||||
cancellation_handler.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
proxy_id,
|
||||
));
|
||||
}
|
||||
|
||||
@@ -448,6 +451,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
cancellation_token.clone(),
|
||||
cancellation_handler.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
proxy_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -459,6 +463,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
cancellation_handler.clone(),
|
||||
proxy_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,15 +80,24 @@ impl ReportableError for CancelError {
|
||||
|
||||
impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
|
||||
pub(crate) fn get_session(self: Arc<Self>) -> Session<P> {
|
||||
pub(crate) fn get_session(self: Arc<Self>, proxy_id: u16) -> Session<P> {
|
||||
// we intentionally generate a random "backend pid" and "secret key" here.
|
||||
// we use the corresponding u64 as an identifier for the
|
||||
// actual endpoint+pid+secret for postgres/pgbouncer.
|
||||
//
|
||||
// if we forwarded the backend_pid from postgres to the client, there would be a lot
|
||||
// of overlap between our computes as most pids are small (~100).
|
||||
|
||||
let key = loop {
|
||||
let key = rand::random();
|
||||
let key_rand: u64 = rand::random::<u64>() & 0x0000_ffff_ffff_ffff;
|
||||
|
||||
let backend_pid = ((proxy_id as u32) << 16) | ((key_rand >> 32) as u32) as u32;
|
||||
let cancel_key = (key_rand as u32) as i32;
|
||||
|
||||
let key = CancelKeyData {
|
||||
backend_pid: (backend_pid as i32),
|
||||
cancel_key,
|
||||
};
|
||||
|
||||
// Random key collisions are unlikely to happen here, but they're still possible,
|
||||
// which is why we have to take care not to rewrite an existing key.
|
||||
@@ -451,7 +460,7 @@ mod tests {
|
||||
CancellationSource::FromRedis,
|
||||
));
|
||||
|
||||
let session = cancellation_handler.clone().get_session();
|
||||
let session = cancellation_handler.clone().get_session(123);
|
||||
assert!(cancellation_handler.contains(&session));
|
||||
drop(session);
|
||||
// Check that the session has been dropped.
|
||||
|
||||
@@ -16,6 +16,8 @@ use crate::serverless::GlobalConnPoolOptions;
|
||||
pub use crate::tls::server_config::{configure_tls, TlsConfig};
|
||||
use crate::types::Host;
|
||||
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
@@ -416,6 +418,23 @@ impl FromStr for ConcurrencyLockOptions {
|
||||
}
|
||||
}
|
||||
|
||||
fn map_u32_to_u8(value: u32) -> u8 {
|
||||
((value * 31 + 17) % 255) as u8
|
||||
}
|
||||
|
||||
pub fn obfuscated_proxy_id(process_id: u32, region_id: &str) -> u16 {
|
||||
let process_id = map_u32_to_u8(process_id);
|
||||
let hash_region_id = Sha256::digest(region_id.as_bytes());
|
||||
|
||||
const BASE: u64 = 257;
|
||||
let combined_region = hash_region_id.iter().enumerate().fold(0u64, |acc, (i, &byte)| {
|
||||
(acc + (byte as u64 * BASE.pow(i as u32))) % 255
|
||||
});
|
||||
|
||||
let combined_region = (combined_region % 255) as u8;
|
||||
((combined_region as u16) * 257 + (process_id as u16)) % 65535
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -511,4 +530,12 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_proxy_id_obfuscation() {
|
||||
let process_id = 123;
|
||||
let region_id = "us-west-2";
|
||||
let proxy_id = obfuscated_proxy_id(process_id, region_id);
|
||||
assert_eq!(proxy_id, 0x1f7b);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ pub async fn task_main(
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
proxy_id: u16,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
@@ -89,6 +90,7 @@ pub async fn task_main(
|
||||
peer_addr,
|
||||
crate::metrics::Protocol::Tcp,
|
||||
&config.region,
|
||||
proxy_id,
|
||||
);
|
||||
|
||||
let res = handle_client(
|
||||
@@ -222,7 +224,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
node.cancel_closure
|
||||
.set_ip_allowlist(ip_allowlist.unwrap_or_default());
|
||||
let session = cancellation_handler.get_session();
|
||||
let session = cancellation_handler.get_session(ctx.proxy_id());
|
||||
prepare_client_connection(&node, &session, &mut stream).await?;
|
||||
|
||||
// Before proxy passing, forward to compute whatever data is left in the
|
||||
|
||||
@@ -46,6 +46,7 @@ struct RequestContextInner {
|
||||
pub(crate) protocol: Protocol,
|
||||
first_packet: chrono::DateTime<Utc>,
|
||||
region: &'static str,
|
||||
pub(crate) proxy_id: u16, // for generating cancel keys per region/process
|
||||
pub(crate) span: Span,
|
||||
|
||||
// filled in as they are discovered
|
||||
@@ -92,6 +93,7 @@ impl Clone for RequestContext {
|
||||
protocol: inner.protocol,
|
||||
first_packet: inner.first_packet,
|
||||
region: inner.region,
|
||||
proxy_id: inner.proxy_id,
|
||||
span: info_span!("background_task"),
|
||||
|
||||
project: inner.project,
|
||||
@@ -124,6 +126,7 @@ impl RequestContext {
|
||||
conn_info: ConnectionInfo,
|
||||
protocol: Protocol,
|
||||
region: &'static str,
|
||||
proxy_id: u16,
|
||||
) -> Self {
|
||||
// TODO: be careful with long lived spans
|
||||
let span = info_span!(
|
||||
@@ -141,6 +144,7 @@ impl RequestContext {
|
||||
protocol,
|
||||
first_packet: Utc::now(),
|
||||
region,
|
||||
proxy_id,
|
||||
span,
|
||||
|
||||
project: None,
|
||||
@@ -172,7 +176,7 @@ impl RequestContext {
|
||||
let ip = IpAddr::from([127, 0, 0, 1]);
|
||||
let addr = SocketAddr::new(ip, 5432);
|
||||
let conn_info = ConnectionInfo { addr, extra: None };
|
||||
RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
|
||||
RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test", 1)
|
||||
}
|
||||
|
||||
pub(crate) fn console_application_name(&self) -> String {
|
||||
@@ -334,6 +338,10 @@ impl RequestContext {
|
||||
.latency_timer
|
||||
.success();
|
||||
}
|
||||
|
||||
pub(crate) fn proxy_id(&self) -> u16 {
|
||||
self.0.try_lock().expect("should not deadlock").proxy_id
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LatencyTimerPause<'a> {
|
||||
|
||||
@@ -59,6 +59,7 @@ pub async fn task_main(
|
||||
cancellation_token: CancellationToken,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
proxy_id: u16,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
@@ -124,6 +125,7 @@ pub async fn task_main(
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Tcp,
|
||||
&config.region,
|
||||
proxy_id,
|
||||
);
|
||||
|
||||
let res = handle_client(
|
||||
@@ -358,7 +360,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
node.cancel_closure
|
||||
.set_ip_allowlist(ip_allowlist.unwrap_or_default());
|
||||
let session = cancellation_handler.get_session();
|
||||
let session = cancellation_handler.get_session(ctx.proxy_id());
|
||||
prepare_client_connection(&node, &session, &mut stream).await?;
|
||||
|
||||
// Before proxy passing, forward to compute whatever data is left in the
|
||||
|
||||
@@ -63,6 +63,7 @@ pub async fn task_main(
|
||||
cancellation_token: CancellationToken,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
proxy_id: u16,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("websocket server has shut down");
|
||||
@@ -198,6 +199,7 @@ pub async fn task_main(
|
||||
conn,
|
||||
conn_info,
|
||||
session_id,
|
||||
proxy_id,
|
||||
))
|
||||
.await;
|
||||
}
|
||||
@@ -324,6 +326,7 @@ async fn connection_handler(
|
||||
conn: AsyncRW,
|
||||
conn_info: ConnectionInfo,
|
||||
session_id: uuid::Uuid,
|
||||
proxy_id: u16,
|
||||
) {
|
||||
let session_id = AtomicTake::new(session_id);
|
||||
|
||||
@@ -371,6 +374,7 @@ async fn connection_handler(
|
||||
http_request_token,
|
||||
endpoint_rate_limiter.clone(),
|
||||
cancellations,
|
||||
proxy_id,
|
||||
)
|
||||
.in_current_span()
|
||||
.map_ok_or_else(api_error_into_response, |r| r),
|
||||
@@ -419,6 +423,7 @@ async fn request_handler(
|
||||
http_cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellations: TaskTracker,
|
||||
proxy_id: u16,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
@@ -436,6 +441,7 @@ async fn request_handler(
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Ws,
|
||||
&config.region,
|
||||
proxy_id,
|
||||
);
|
||||
|
||||
let span = ctx.span();
|
||||
@@ -473,6 +479,7 @@ async fn request_handler(
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Http,
|
||||
&config.region,
|
||||
proxy_id,
|
||||
);
|
||||
let span = ctx.span();
|
||||
|
||||
|
||||
@@ -102,6 +102,11 @@ impl Client {
|
||||
self.get(&uri).await
|
||||
}
|
||||
|
||||
pub async fn utilization(&self) -> Result<reqwest::Response> {
|
||||
let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
|
||||
self.get(&uri).await
|
||||
}
|
||||
|
||||
async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
|
||||
self.request(Method::GET, uri, ()).await
|
||||
}
|
||||
|
||||
@@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler {
|
||||
|
||||
/// Parsed Postgres command.
|
||||
enum SafekeeperPostgresCommand {
|
||||
StartWalPush,
|
||||
StartReplication { start_lsn: Lsn, term: Option<Term> },
|
||||
StartWalPush {
|
||||
proto_version: u32,
|
||||
// Eventually timelines will be always created explicitly by storcon.
|
||||
// This option allows legacy behaviour for compute to do that until we
|
||||
// fully migrate.
|
||||
allow_timeline_creation: bool,
|
||||
},
|
||||
StartReplication {
|
||||
start_lsn: Lsn,
|
||||
term: Option<Term>,
|
||||
},
|
||||
IdentifySystem,
|
||||
TimelineStatus,
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
JSONCtrl {
|
||||
cmd: AppendLogicalMessage,
|
||||
},
|
||||
}
|
||||
|
||||
fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
if cmd.starts_with("START_WAL_PUSH") {
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush)
|
||||
// Allow additional options in postgres START_REPLICATION style like
|
||||
// START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false').
|
||||
// Parsing here is very naive and breaks in case of commas or
|
||||
// whitespaces in values, but enough for our purposes.
|
||||
let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap();
|
||||
let caps = re
|
||||
.captures(cmd)
|
||||
.context(format!("failed to parse START_WAL_PUSH command {}", cmd))?;
|
||||
// capture () content
|
||||
let options = caps.get(2).map(|m| m.as_str()).unwrap_or("");
|
||||
// default values
|
||||
let mut proto_version = 2;
|
||||
let mut allow_timeline_creation = true;
|
||||
for kvstr in options.split(",") {
|
||||
if kvstr.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let mut kvit = kvstr.split_whitespace();
|
||||
let key = kvit.next().context(format!(
|
||||
"failed to parse key in kv {} in command {}",
|
||||
kvstr, cmd
|
||||
))?;
|
||||
let value = kvit.next().context(format!(
|
||||
"failed to parse value in kv {} in command {}",
|
||||
kvstr, cmd
|
||||
))?;
|
||||
let value_trimmed = value.trim_matches('\'');
|
||||
if key == "proto_version" {
|
||||
proto_version = value_trimmed.parse::<u32>().context(format!(
|
||||
"failed to parse proto_version value {} in command {}",
|
||||
value, cmd
|
||||
))?;
|
||||
}
|
||||
if key == "allow_timeline_creation" {
|
||||
allow_timeline_creation = value_trimmed.parse::<bool>().context(format!(
|
||||
"failed to parse allow_timeline_creation value {} in command {}",
|
||||
value, cmd
|
||||
))?;
|
||||
}
|
||||
}
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
})
|
||||
} else if cmd.starts_with("START_REPLICATION") {
|
||||
let re = Regex::new(
|
||||
// We follow postgres START_REPLICATION LOGICAL options to pass term.
|
||||
@@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
|
||||
fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
|
||||
SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
|
||||
SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
|
||||
@@ -293,8 +347,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush => {
|
||||
self.handle_start_wal_push(pgb)
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation)
|
||||
.instrument(info_span!("WAL receiver"))
|
||||
.await
|
||||
}
|
||||
@@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SafekeeperPostgresCommand;
|
||||
|
||||
/// Test parsing of START_WAL_PUSH command
|
||||
#[test]
|
||||
fn test_start_wal_push_parse() {
|
||||
let cmd = "START_WAL_PUSH";
|
||||
let parsed = super::parse_cmd(cmd).expect("failed to parse");
|
||||
match parsed {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
assert_eq!(proto_version, 2);
|
||||
assert!(allow_timeline_creation);
|
||||
}
|
||||
_ => panic!("unexpected command"),
|
||||
}
|
||||
|
||||
let cmd =
|
||||
"START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')";
|
||||
let parsed = super::parse_cmd(cmd).expect("failed to parse");
|
||||
match parsed {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
assert_eq!(proto_version, 3);
|
||||
assert!(!allow_timeline_creation);
|
||||
}
|
||||
_ => panic!("unexpected command"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,6 +127,13 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn utilization_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let utilization = global_timelines.get_timeline_counts();
|
||||
json_response(StatusCode::OK, utilization)
|
||||
}
|
||||
|
||||
/// List all (not deleted) timelines.
|
||||
/// Note: it is possible to do the same with debug_dump.
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -620,6 +627,7 @@ pub fn make_router(
|
||||
failpoints_handler(r, cancel).await
|
||||
})
|
||||
})
|
||||
.get("/v1/uzilization", |r| request_span(r, utilization_handler))
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_handler)
|
||||
})
|
||||
|
||||
@@ -200,9 +200,14 @@ impl SafekeeperPostgresHandler {
|
||||
pub async fn handle_start_wal_push<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
proto_version: u32,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(), QueryError> {
|
||||
let mut tli: Option<WalResidentTimeline> = None;
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
|
||||
if let Err(end) = self
|
||||
.handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation)
|
||||
.await
|
||||
{
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
let handle_end_fut = pgb.handle_copy_stream_end(end);
|
||||
// If we managed to create the timeline, augment logging with current LSNs etc.
|
||||
@@ -222,6 +227,8 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tli: &mut Option<WalResidentTimeline>,
|
||||
proto_version: u32,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// The `tli` parameter is only used for passing _out_ a timeline, one should
|
||||
// not have been passed in.
|
||||
@@ -250,12 +257,17 @@ impl SafekeeperPostgresHandler {
|
||||
conn_id: self.conn_id,
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
proto_version,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
global_timelines: self.global_timelines.clone(),
|
||||
};
|
||||
|
||||
// Read first message and create timeline if needed.
|
||||
let res = network_reader.read_first_message().await;
|
||||
// Read first message and create timeline if needed and allowed. This
|
||||
// won't be when timelines will be always created by storcon and
|
||||
// allow_timeline_creation becomes false.
|
||||
let res = network_reader
|
||||
.read_first_message(allow_timeline_creation)
|
||||
.await;
|
||||
|
||||
let network_res = if let Ok((timeline, next_msg)) = res {
|
||||
let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
|
||||
@@ -313,6 +325,7 @@ struct NetworkReader<'a, IO> {
|
||||
conn_id: ConnectionId,
|
||||
pgb_reader: &'a mut PostgresBackendReader<IO>,
|
||||
peer_addr: SocketAddr,
|
||||
proto_version: u32,
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
@@ -322,9 +335,10 @@ struct NetworkReader<'a, IO> {
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
async fn read_first_message(
|
||||
&mut self,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let next_msg = read_message(self.pgb_reader, self.proto_version).await?;
|
||||
let tli = match next_msg {
|
||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
||||
info!(
|
||||
@@ -336,17 +350,22 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.create(
|
||||
self.ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
let tli = if allow_timeline_creation {
|
||||
self.global_timelines
|
||||
.create(
|
||||
self.ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await
|
||||
.context("create timeline")?
|
||||
} else {
|
||||
self.global_timelines
|
||||
.get(self.ttid)
|
||||
.context("get timeline")?
|
||||
};
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
_ => {
|
||||
@@ -375,7 +394,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
));
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -383,9 +402,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
/// TODO: Return Ok(None) on graceful termination.
|
||||
async fn read_message<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_reader: &mut PostgresBackendReader<IO>,
|
||||
proto_version: u32,
|
||||
) -> Result<ProposerAcceptorMessage, CopyStreamHandlerEnd> {
|
||||
let copy_data = pgb_reader.read_copy_message().await?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?;
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
@@ -393,6 +413,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_reader: &mut PostgresBackendReader<IO>,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
mut next_msg: ProposerAcceptorMessage,
|
||||
proto_version: u32,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
/// Threshold for logging slow WalAcceptor sends.
|
||||
const SLOW_THRESHOLD: Duration = Duration::from_secs(5);
|
||||
@@ -425,7 +446,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc();
|
||||
WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64);
|
||||
|
||||
next_msg = read_message(pgb_reader).await?;
|
||||
next_msg = read_message(pgb_reader, proto_version).await?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
const SK_PROTOCOL_VERSION: u32 = 2;
|
||||
pub const SK_PROTOCOL_VERSION: u32 = 2;
|
||||
pub const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -317,7 +317,14 @@ pub enum ProposerAcceptorMessage {
|
||||
|
||||
impl ProposerAcceptorMessage {
|
||||
/// Parse proposer message.
|
||||
pub fn parse(msg_bytes: Bytes) -> Result<ProposerAcceptorMessage> {
|
||||
pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
|
||||
if proto_version != SK_PROTOCOL_VERSION {
|
||||
bail!(
|
||||
"incompatible protocol version {}, expected {}",
|
||||
proto_version,
|
||||
SK_PROTOCOL_VERSION
|
||||
);
|
||||
}
|
||||
// xxx using Reader is inefficient but easy to work with bincode
|
||||
let mut stream = msg_bytes.reader();
|
||||
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
|
||||
|
||||
@@ -13,6 +13,7 @@ use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::SafekeeperUtilization;
|
||||
use safekeeper_api::ServerInfo;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -416,6 +417,20 @@ impl GlobalTimelines {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Returns statistics about timeline counts
|
||||
pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
|
||||
let global_lock = self.state.lock().unwrap();
|
||||
let timeline_count = global_lock
|
||||
.timelines
|
||||
.values()
|
||||
.filter(|t| match t {
|
||||
GlobalMapTimeline::CreationInProgress => false,
|
||||
GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
|
||||
})
|
||||
.count() as u64;
|
||||
SafekeeperUtilization { timeline_count }
|
||||
}
|
||||
|
||||
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
|
||||
/// and that's why it can return cancelled timelines, to retry deleting them.
|
||||
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
|
||||
|
||||
@@ -15,7 +15,9 @@ use desim::{
|
||||
};
|
||||
use http::Uri;
|
||||
use safekeeper::{
|
||||
safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION},
|
||||
safekeeper::{
|
||||
ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION,
|
||||
},
|
||||
state::{TimelinePersistentState, TimelineState},
|
||||
timeline::TimelineError,
|
||||
wal_storage::Storage,
|
||||
@@ -285,7 +287,7 @@ impl ConnState {
|
||||
bail!("finished processing START_REPLICATION")
|
||||
}
|
||||
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?;
|
||||
debug!("got msg: {:?}", msg);
|
||||
self.process(msg, global)
|
||||
} else {
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled';
|
||||
UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause';
|
||||
@@ -0,0 +1,2 @@
|
||||
ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause';
|
||||
UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled';
|
||||
@@ -5411,6 +5411,15 @@ impl Service {
|
||||
|
||||
expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
|
||||
|
||||
// Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig`
|
||||
// definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new
|
||||
// fields are added, before doing a comparison.
|
||||
for tsp in &mut persistent_shards {
|
||||
let config: TenantConfig = serde_json::from_str(&tsp.config)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible");
|
||||
}
|
||||
|
||||
if persistent_shards != expect_shards {
|
||||
tracing::error!("Consistency check failed on shards.");
|
||||
|
||||
@@ -7270,19 +7279,14 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
|
||||
/// 1. The node should be filled until it reaches the expected cluster average of
|
||||
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
|
||||
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
|
||||
/// throughout the cluster. We achieve this by picking tenant shards from each node,
|
||||
/// starting from the ones with the largest number of attached shards, until the node
|
||||
/// reaches the expected cluster average.
|
||||
/// 3. Avoid promoting more shards of the same tenant than required. The upper bound
|
||||
/// for the number of tenants from the same shard promoted to the node being filled is:
|
||||
/// shard count for the tenant divided by the number of nodes in the cluster.
|
||||
/// Create a node fill plan (pick secondaries to promote), based on:
|
||||
/// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
|
||||
/// outside their home AZ, should be migrated back here.
|
||||
/// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of
|
||||
/// attached shards, we will promote more shards from the nodes with the most attached shards, unless
|
||||
/// those shards have a home AZ that doesn't match the node we're filling.
|
||||
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
|
||||
let node_az = nodes
|
||||
@@ -7291,53 +7295,79 @@ impl Service {
|
||||
.get_availability_zone_id()
|
||||
.clone();
|
||||
|
||||
let mut tids_by_node = tenants
|
||||
.iter_mut()
|
||||
.filter_map(|(tid, tenant_shard)| {
|
||||
if !matches!(
|
||||
tenant_shard.get_scheduling_policy(),
|
||||
ShardSchedulingPolicy::Active
|
||||
) {
|
||||
// Only include tenants in fills if they have a normal (Active) scheduling policy. We
|
||||
// even exclude Essential, because moving to fill a node is not essential to keeping this
|
||||
// tenant available.
|
||||
return None;
|
||||
}
|
||||
// The tenant shard IDs that we plan to promote from secondary to attached on this node
|
||||
let mut plan = Vec::new();
|
||||
|
||||
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
|
||||
// shards which belong on this node, not to promote shards whose scheduling preference
|
||||
// would be on their currently attached node. So will avoid promoting shards whose
|
||||
// home AZ doesn't match the AZ of the node we're filling.
|
||||
match tenant_shard.preferred_az() {
|
||||
None => {
|
||||
// Shard doesn't have an AZ preference: it is elegible to be moved.
|
||||
}
|
||||
Some(az) if az == &node_az => {
|
||||
// This shard's home AZ is equal to the node we're filling: it is
|
||||
// elegible to be moved: fall through;
|
||||
}
|
||||
Some(_) => {
|
||||
// This shard's home AZ is somewhere other than the node we're filling:
|
||||
// do not include it in the fill plan.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
// Collect shards which do not have a preferred AZ & are elegible for moving in stage 2
|
||||
let mut free_tids_by_node: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
|
||||
if tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
// Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could
|
||||
// conceivably come up in real life if deploying a single-AZ region intentionally.
|
||||
let respect_azs = nodes
|
||||
.values()
|
||||
.map(|n| n.get_availability_zone_id())
|
||||
.unique()
|
||||
.count()
|
||||
> 1;
|
||||
|
||||
// Step 1: collect all shards that we are required to migrate back to this node because their AZ preference
|
||||
// requires it.
|
||||
for (tsid, tenant_shard) in tenants {
|
||||
if !tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
// Shard doesn't have a secondary on this node, ignore it.
|
||||
continue;
|
||||
}
|
||||
|
||||
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
|
||||
// shards which belong on this node, not to promote shards whose scheduling preference
|
||||
// would be on their currently attached node. So will avoid promoting shards whose
|
||||
// home AZ doesn't match the AZ of the node we're filling.
|
||||
match tenant_shard.preferred_az() {
|
||||
_ if !respect_azs => {
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
return Some((*primary, *tid));
|
||||
free_tids_by_node.entry(*primary).or_default().push(*tsid);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Shard doesn't have an AZ preference: it is elegible to be moved, but we
|
||||
// will only do so if our target shard count requires it.
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
free_tids_by_node.entry(*primary).or_default().push(*tsid);
|
||||
}
|
||||
}
|
||||
Some(az) if az == &node_az => {
|
||||
// This shard's home AZ is equal to the node we're filling: it should
|
||||
// be moved back to this node as part of filling, unless its currently
|
||||
// attached location is also in its home AZ.
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
if nodes
|
||||
.get(primary)
|
||||
.expect("referenced node must exist")
|
||||
.get_availability_zone_id()
|
||||
!= tenant_shard
|
||||
.preferred_az()
|
||||
.expect("tenant must have an AZ preference")
|
||||
{
|
||||
plan.push(*tsid)
|
||||
}
|
||||
} else {
|
||||
plan.push(*tsid)
|
||||
}
|
||||
}
|
||||
Some(_) => {
|
||||
// This shard's home AZ is somewhere other than the node we're filling,
|
||||
// it may not be moved back to this node as part of filling. Ignore it
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
})
|
||||
.into_group_map();
|
||||
// Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments
|
||||
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
|
||||
|
||||
let expected_attached = locked.scheduler.expected_attached_shard_count();
|
||||
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
|
||||
|
||||
let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
|
||||
let mut plan = Vec::new();
|
||||
|
||||
for (node_id, attached) in nodes_by_load {
|
||||
let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available());
|
||||
@@ -7346,7 +7376,7 @@ impl Service {
|
||||
}
|
||||
|
||||
if plan.len() >= fill_requirement
|
||||
|| tids_by_node.is_empty()
|
||||
|| free_tids_by_node.is_empty()
|
||||
|| attached <= expected_attached
|
||||
{
|
||||
break;
|
||||
@@ -7358,7 +7388,7 @@ impl Service {
|
||||
|
||||
let mut remove_node = false;
|
||||
while take > 0 {
|
||||
match tids_by_node.get_mut(&node_id) {
|
||||
match free_tids_by_node.get_mut(&node_id) {
|
||||
Some(tids) => match tids.pop() {
|
||||
Some(tid) => {
|
||||
let max_promote_for_tenant = std::cmp::max(
|
||||
@@ -7384,7 +7414,7 @@ impl Service {
|
||||
}
|
||||
|
||||
if remove_node {
|
||||
tids_by_node.remove(&node_id);
|
||||
free_tids_by_node.remove(&node_id);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::NodeId;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use super::Service;
|
||||
use super::{Node, Scheduler, Service, TenantShard};
|
||||
|
||||
pub struct ChaosInjector {
|
||||
service: Arc<Service>,
|
||||
@@ -35,50 +41,86 @@ impl ChaosInjector {
|
||||
}
|
||||
}
|
||||
|
||||
/// If a shard has a secondary and attached location, then re-assign the secondary to be
|
||||
/// attached and the attached to be secondary.
|
||||
///
|
||||
/// Only modifies tenants if they're in Active scheduling policy.
|
||||
fn maybe_migrate_to_secondary(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
nodes: &Arc<HashMap<NodeId, Node>>,
|
||||
tenants: &mut BTreeMap<TenantShardId, TenantShard>,
|
||||
scheduler: &mut Scheduler,
|
||||
) {
|
||||
let shard = tenants
|
||||
.get_mut(&tenant_shard_id)
|
||||
.expect("Held lock between choosing ID and this get");
|
||||
|
||||
if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
|
||||
// Skip non-active scheduling policies, so that a shard with a policy like Pause can
|
||||
// be pinned without being disrupted by us.
|
||||
tracing::info!(
|
||||
"Skipping shard {tenant_shard_id}: scheduling policy is {:?}",
|
||||
shard.get_scheduling_policy()
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Pick a secondary to promote
|
||||
let Some(new_location) = shard
|
||||
.intent
|
||||
.get_secondary()
|
||||
.choose(&mut thread_rng())
|
||||
.cloned()
|
||||
else {
|
||||
tracing::info!(
|
||||
"Skipping shard {tenant_shard_id}: no secondary location, can't migrate"
|
||||
);
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(old_location) = *shard.intent.get_attached() else {
|
||||
tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location");
|
||||
return;
|
||||
};
|
||||
|
||||
tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}");
|
||||
|
||||
shard.intent.demote_attached(scheduler, old_location);
|
||||
shard.intent.promote_attached(scheduler, new_location);
|
||||
self.service.maybe_reconcile_shard(shard, nodes);
|
||||
}
|
||||
|
||||
async fn inject_chaos(&mut self) {
|
||||
// Pick some shards to interfere with
|
||||
let batch_size = 128;
|
||||
let mut inner = self.service.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = inner.parts_mut();
|
||||
let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
|
||||
let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
|
||||
|
||||
for victim in victims {
|
||||
let shard = tenants
|
||||
.get_mut(victim)
|
||||
.expect("Held lock between choosing ID and this get");
|
||||
|
||||
if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
|
||||
// Skip non-active scheduling policies, so that a shard with a policy like Pause can
|
||||
// be pinned without being disrupted by us.
|
||||
tracing::info!(
|
||||
"Skipping shard {victim}: scheduling policy is {:?}",
|
||||
shard.get_scheduling_policy()
|
||||
);
|
||||
continue;
|
||||
// Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector
|
||||
// continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some
|
||||
// random tenants to move, and then on next chaos iteration moving them back, then picking some new
|
||||
// random tenants on the next iteration.
|
||||
let mut victims = Vec::with_capacity(batch_size);
|
||||
for shard in tenants.values() {
|
||||
if shard.is_attached_outside_preferred_az(nodes) {
|
||||
victims.push(shard.tenant_shard_id);
|
||||
}
|
||||
|
||||
// Pick a secondary to promote
|
||||
let Some(new_location) = shard
|
||||
.intent
|
||||
.get_secondary()
|
||||
.choose(&mut thread_rng())
|
||||
.cloned()
|
||||
else {
|
||||
tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
|
||||
continue;
|
||||
};
|
||||
if victims.len() >= batch_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let Some(old_location) = *shard.intent.get_attached() else {
|
||||
tracing::info!("Skipping shard {victim}: currently has no attached location");
|
||||
continue;
|
||||
};
|
||||
let choose_random = batch_size.saturating_sub(victims.len());
|
||||
tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());
|
||||
|
||||
tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}");
|
||||
let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
|
||||
victims.extend(random_victims);
|
||||
|
||||
shard.intent.demote_attached(scheduler, old_location);
|
||||
shard.intent.promote_attached(scheduler, new_location);
|
||||
self.service.maybe_reconcile_shard(shard, nodes);
|
||||
for victim in victims {
|
||||
self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1793,6 +1793,23 @@ impl TenantShard {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the tenant shard is attached to a node that is outside the preferred AZ.
|
||||
///
|
||||
/// If the shard does not have a preferred AZ, returns false.
|
||||
pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap<NodeId, Node>) -> bool {
|
||||
self.intent
|
||||
.get_attached()
|
||||
.map(|node_id| {
|
||||
Some(
|
||||
nodes
|
||||
.get(&node_id)
|
||||
.expect("referenced node exists")
|
||||
.get_availability_zone_id(),
|
||||
) == self.intent.preferred_az_id.as_ref()
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TenantShard {
|
||||
|
||||
@@ -370,6 +370,7 @@ class NeonEnvBuilder:
|
||||
pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None,
|
||||
num_safekeepers: int = 1,
|
||||
num_pageservers: int = 1,
|
||||
num_azs: int = 1,
|
||||
# Use non-standard SK ids to check for various parsing bugs
|
||||
safekeepers_id_start: int = 0,
|
||||
# fsync is disabled by default to make the tests go faster
|
||||
@@ -401,6 +402,7 @@ class NeonEnvBuilder:
|
||||
self.pageserver_config_override = pageserver_config_override
|
||||
self.num_safekeepers = num_safekeepers
|
||||
self.num_pageservers = num_pageservers
|
||||
self.num_azs = num_azs
|
||||
self.safekeepers_id_start = safekeepers_id_start
|
||||
self.safekeepers_enable_fsync = safekeepers_enable_fsync
|
||||
self.auth_enabled = auth_enabled
|
||||
@@ -990,6 +992,7 @@ class NeonEnv:
|
||||
self.endpoints = EndpointFactory(self)
|
||||
self.safekeepers: list[Safekeeper] = []
|
||||
self.pageservers: list[NeonPageserver] = []
|
||||
self.num_azs = config.num_azs
|
||||
self.broker = NeonBroker(self)
|
||||
self.pageserver_remote_storage = config.pageserver_remote_storage
|
||||
self.safekeepers_remote_storage = config.safekeepers_remote_storage
|
||||
@@ -1090,14 +1093,21 @@ class NeonEnv:
|
||||
http=self.port_distributor.get_port(),
|
||||
)
|
||||
|
||||
# Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override`
|
||||
if self.num_azs > 1:
|
||||
# Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc.
|
||||
az_prefix = DEFAULT_AZ_ID[:-1]
|
||||
availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}"
|
||||
else:
|
||||
availability_zone = DEFAULT_AZ_ID
|
||||
|
||||
ps_cfg: dict[str, Any] = {
|
||||
"id": ps_id,
|
||||
"listen_pg_addr": f"localhost:{pageserver_port.pg}",
|
||||
"listen_http_addr": f"localhost:{pageserver_port.http}",
|
||||
"pg_auth_type": pg_auth_type,
|
||||
"http_auth_type": http_auth_type,
|
||||
# Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
|
||||
"availability_zone": DEFAULT_AZ_ID,
|
||||
"availability_zone": availability_zone,
|
||||
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
|
||||
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
|
||||
"no_sync": True,
|
||||
|
||||
@@ -4,9 +4,19 @@ import time
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
BTREE_NUM_CYCLEID_PAGES = """
|
||||
WITH raw_pages AS (
|
||||
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
|
||||
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
|
||||
WITH lsns AS (
|
||||
/*
|
||||
* pg_switch_wal() ensures we have an LSN that
|
||||
* 1. is after any previous modifications, but also,
|
||||
* 2. (critically) is flushed, preventing any issues with waiting for
|
||||
* unflushed WAL in PageServer.
|
||||
*/
|
||||
SELECT pg_switch_wal() as lsn
|
||||
),
|
||||
raw_pages AS (
|
||||
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page
|
||||
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno,
|
||||
lsns l(lsn)
|
||||
),
|
||||
parsed_pages AS (
|
||||
/* cycle ID is the last 2 bytes of the btree page */
|
||||
@@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
|
||||
ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
|
||||
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
|
||||
|
||||
ses1.execute("SELECT neon_xlogflush();")
|
||||
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
|
||||
pages = ses1.fetchall()
|
||||
assert (
|
||||
@@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
|
||||
ses1.execute("DELETE FROM t WHERE id <= 610;")
|
||||
|
||||
# Flush wal, for checking purposes
|
||||
ses1.execute("SELECT neon_xlogflush();")
|
||||
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
|
||||
pages = ses1.fetchall()
|
||||
assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
|
||||
@@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
|
||||
# unpin the btree page, allowing s3's vacuum to complete
|
||||
ses2.execute("FETCH ALL FROM foo;")
|
||||
ses2.execute("ROLLBACK;")
|
||||
# flush WAL to make sure PS is up-to-date
|
||||
ses1.execute("SELECT neon_xlogflush();")
|
||||
# check that our expectations are correct
|
||||
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
|
||||
pages = ses1.fetchall()
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
# NB: there are benchmarks that double-serve as tests inside the `performance` directory.
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct
|
||||
@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"])
|
||||
def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str):
|
||||
def patch_pageserver_toml(config):
|
||||
config["page_service_pipelining"] = {
|
||||
"mode": "pipelined",
|
||||
"max_batch_size": 32,
|
||||
"execution": "concurrent-futures",
|
||||
}
|
||||
|
||||
neon_env_builder.pageserver_config_override = patch_pageserver_toml
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
log.info("make flush appear slow")
|
||||
|
||||
log.info("sending requests until pageserver accepts no more")
|
||||
# TODO: extract this into a helper, like subprocess_capture,
|
||||
# so that we capture the stderr from the helper somewhere.
|
||||
child = subprocess.Popen(
|
||||
[
|
||||
neon_binpath / "test_helper_slow_client_reads",
|
||||
env.pageserver.connstr(),
|
||||
str(env.initial_tenant),
|
||||
str(env.initial_timeline),
|
||||
],
|
||||
bufsize=0, # unbuffered
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
)
|
||||
assert child.stdout is not None
|
||||
buf = child.stdout.read(1)
|
||||
if len(buf) != 1:
|
||||
raise Exception("unexpected EOF")
|
||||
if buf != b"R":
|
||||
raise Exception(f"unexpected data: {buf!r}")
|
||||
log.info("helper reports pageserver accepts no more requests")
|
||||
log.info(
|
||||
"assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace"
|
||||
)
|
||||
|
||||
if kind == "pageserver-stop":
|
||||
log.info("try to shut down the pageserver cleanly")
|
||||
env.pageserver.stop()
|
||||
elif kind == "tenant-detach":
|
||||
log.info("try to shut down the tenant")
|
||||
env.pageserver.tenant_detach(env.initial_tenant)
|
||||
else:
|
||||
raise ValueError(f"unexpected kind: {kind}")
|
||||
|
||||
log.info("shutdown did not time out, test passed")
|
||||
@@ -2394,6 +2394,7 @@ def test_storage_controller_node_deletion(
|
||||
Test that deleting a node works & properly reschedules everything that was on the node.
|
||||
"""
|
||||
neon_env_builder.num_pageservers = 3
|
||||
neon_env_builder.num_azs = 3
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
@@ -2407,6 +2408,9 @@ def test_storage_controller_node_deletion(
|
||||
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
|
||||
)
|
||||
|
||||
# Sanity check: initial creations should not leave the system in an unstable scheduling state
|
||||
assert env.storage_controller.reconcile_all() == 0
|
||||
|
||||
victim = env.pageservers[-1]
|
||||
|
||||
# The procedure a human would follow is:
|
||||
@@ -3211,7 +3215,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
|
||||
# some small tests for the scheduling policy querying and returning APIs
|
||||
newest_info = target.get_safekeeper(inserted["id"])
|
||||
assert newest_info
|
||||
assert newest_info["scheduling_policy"] == "Disabled"
|
||||
assert newest_info["scheduling_policy"] == "Pause"
|
||||
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
|
||||
newest_info = target.get_safekeeper(inserted["id"])
|
||||
assert newest_info
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from threading import Thread
|
||||
|
||||
import pytest
|
||||
@@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
|
||||
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause"))
|
||||
|
||||
def timeline_create():
|
||||
try:
|
||||
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
|
||||
raise RuntimeError("creation succeeded even though it shouldn't")
|
||||
except ReadTimeout:
|
||||
pass
|
||||
|
||||
Thread(target=timeline_create).start()
|
||||
|
||||
def hit_initdb_upload_failpoint():
|
||||
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
|
||||
|
||||
wait_until(hit_initdb_upload_failpoint)
|
||||
|
||||
def creation_connection_timed_out():
|
||||
env.pageserver.assert_log_contains(
|
||||
"POST.*/timeline.* request was dropped before completing"
|
||||
)
|
||||
|
||||
# Wait so that we hit the timeout and the connection is dropped
|
||||
# (But timeline creation still continues)
|
||||
wait_until(creation_connection_timed_out)
|
||||
|
||||
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
|
||||
ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1)
|
||||
raise RuntimeError("creation succeeded even though it shouldn't")
|
||||
|
||||
def tenant_delete():
|
||||
def tenant_delete_inner():
|
||||
@@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
|
||||
|
||||
wait_until(tenant_delete_inner)
|
||||
|
||||
Thread(target=tenant_delete).start()
|
||||
# We will spawn background threads for timeline creation and tenant deletion. They will both
|
||||
# get blocked on our failpoint.
|
||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
||||
create_fut = executor.submit(timeline_create)
|
||||
|
||||
def deletion_arrived():
|
||||
env.pageserver.assert_log_contains(
|
||||
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
|
||||
)
|
||||
def hit_initdb_upload_failpoint():
|
||||
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
|
||||
|
||||
wait_until(deletion_arrived)
|
||||
wait_until(hit_initdb_upload_failpoint)
|
||||
|
||||
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
|
||||
def creation_connection_timed_out():
|
||||
env.pageserver.assert_log_contains(
|
||||
"POST.*/timeline.* request was dropped before completing"
|
||||
)
|
||||
|
||||
# Disable the failpoint and wait for deletion to finish
|
||||
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
|
||||
# Wait so that we hit the timeout and the connection is dropped
|
||||
# (But timeline creation still continues)
|
||||
wait_until(creation_connection_timed_out)
|
||||
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
with pytest.raises(ReadTimeout):
|
||||
# Our creation failed from the client's point of view.
|
||||
create_fut.result()
|
||||
|
||||
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
|
||||
|
||||
delete_fut = executor.submit(tenant_delete)
|
||||
|
||||
def deletion_arrived():
|
||||
env.pageserver.assert_log_contains(
|
||||
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
|
||||
)
|
||||
|
||||
wait_until(deletion_arrived)
|
||||
|
||||
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
|
||||
|
||||
# Disable the failpoint and wait for deletion to finish
|
||||
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
|
||||
|
||||
delete_fut.result()
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
|
||||
@@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
io_metrics = query_all_safekeepers(
|
||||
"safekeeper_pg_io_bytes_total",
|
||||
{
|
||||
"app_name": "pageserver",
|
||||
"app_name": f"pageserver-{env.pageserver.id}",
|
||||
"client_az": "test_ps_az",
|
||||
"dir": io_direction,
|
||||
"same_az": "false",
|
||||
|
||||
Reference in New Issue
Block a user