Compare commits

..

4 Commits

Author SHA1 Message Date
Arthur Petukhovsky
f2e98b2b0a Write WAL bytes to data dir 2023-12-28 18:49:51 +00:00
Arthur Petukhovsky
0ba55719b0 Check bytes in segment intersection 2023-12-28 17:55:04 +00:00
Arthur Petukhovsky
b162b4a9cf Fix clap arg 2023-12-28 17:10:05 +00:00
Arthur Petukhovsky
bbc8da687e Add dryrun version of the script 2023-12-28 16:55:21 +00:00
181 changed files with 3859 additions and 9473 deletions

View File

@@ -1,2 +0,0 @@
[profile.default]
slow-timeout = "1m"

View File

@@ -105,11 +105,11 @@ jobs:
- name: Install Python deps
run: ./scripts/pysync
- name: Run `ruff check` to ensure code format
run: poetry run ruff check .
- name: Run ruff to ensure code format
run: poetry run ruff .
- name: Run `ruff format` to ensure code format
run: poetry run ruff format --check .
- name: Run black to ensure code format
run: poetry run black --diff --check .
- name: Run mypy to check types
run: poetry run mypy .
@@ -339,16 +339,16 @@ jobs:
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run rust tests
- name: Run cargo test
run: |
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -358,7 +358,7 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
- name: Install rust binaries
run: |

220
Cargo.lock generated
View File

@@ -30,8 +30,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
"getrandom 0.2.11",
"once_cell",
"version_check",
"zerocopy",
@@ -52,12 +50,6 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -255,12 +247,6 @@ dependencies = [
"syn 2.0.32",
]
[[package]]
name = "atomic"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
[[package]]
name = "atomic-polyfill"
version = "1.0.2"
@@ -1025,17 +1011,17 @@ dependencies = [
[[package]]
name = "chrono"
version = "0.4.31"
version = "0.4.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.48.0",
"winapi",
]
[[package]]
@@ -1134,20 +1120,6 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
[[package]]
name = "combine"
version = "4.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
dependencies = [
"bytes",
"futures-core",
"memchr",
"pin-project-lite",
"tokio",
"tokio-util",
]
[[package]]
name = "comfy-table"
version = "6.1.4"
@@ -1189,7 +1161,6 @@ dependencies = [
"flate2",
"futures",
"hyper",
"nix 0.26.2",
"notify",
"num_cpus",
"opentelemetry",
@@ -1200,7 +1171,6 @@ dependencies = [
"rust-ini",
"serde",
"serde_json",
"signal-hook",
"tar",
"tokio",
"tokio-postgres",
@@ -2389,6 +2359,19 @@ dependencies = [
"tokio-native-tls",
]
[[package]]
name = "hyper-tungstenite"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
dependencies = [
"hyper",
"pin-project-lite",
"tokio",
"tokio-tungstenite",
"tungstenite",
]
[[package]]
name = "iana-time-zone"
version = "0.1.56"
@@ -2490,12 +2473,6 @@ dependencies = [
"web-sys",
]
[[package]]
name = "integer-encoding"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "io-lifetimes"
version = "1.0.11"
@@ -2859,19 +2836,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
dependencies = [
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -2883,15 +2847,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
dependencies = [
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -2902,28 +2857,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-iter"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.15"
@@ -3146,15 +3079,6 @@ dependencies = [
"tokio-stream",
]
[[package]]
name = "ordered-float"
version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
dependencies = [
"num-traits",
]
[[package]]
name = "ordered-multimap"
version = "0.7.1"
@@ -3198,7 +3122,6 @@ name = "pagebench"
version = "0.1.0"
dependencies = [
"anyhow",
"camino",
"clap",
"futures",
"hdrhistogram",
@@ -3211,7 +3134,6 @@ dependencies = [
"serde",
"serde_json",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
@@ -3415,35 +3337,6 @@ dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "parquet"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"ahash",
"bytes",
"chrono",
"hashbrown 0.14.0",
"num",
"num-bigint",
"paste",
"seq-macro",
"thrift",
"twox-hash",
"zstd",
]
[[package]]
name = "parquet_derive"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"parquet",
"proc-macro2",
"quote",
"syn 2.0.32",
]
[[package]]
name = "password-hash"
version = "0.5.0"
@@ -3867,8 +3760,6 @@ dependencies = [
"base64 0.13.1",
"bstr",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"clap",
"consumption_metrics",
@@ -3882,6 +3773,7 @@ dependencies = [
"hostname",
"humantime",
"hyper",
"hyper-tungstenite",
"ipnet",
"itertools",
"md5",
@@ -3890,8 +3782,6 @@ dependencies = [
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
@@ -3901,9 +3791,7 @@ dependencies = [
"prometheus",
"rand 0.8.5",
"rcgen",
"redis",
"regex",
"remote_storage",
"reqwest",
"reqwest-middleware",
"reqwest-retry",
@@ -3927,13 +3815,11 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls",
"tokio-tungstenite",
"tokio-util",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"tungstenite",
"url",
"utils",
"uuid",
@@ -4066,32 +3952,6 @@ dependencies = [
"yasna",
]
[[package]]
name = "redis"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
dependencies = [
"async-trait",
"bytes",
"combine",
"futures-util",
"itoa",
"percent-encoding",
"pin-project-lite",
"rustls",
"rustls-native-certs",
"rustls-pemfile",
"rustls-webpki 0.101.7",
"ryu",
"sha1_smol",
"socket2 0.4.9",
"tokio",
"tokio-rustls",
"tokio-util",
"url",
]
[[package]]
name = "redox_syscall"
version = "0.2.16"
@@ -4543,14 +4403,12 @@ dependencies = [
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
"hex",
"histogram",
@@ -4589,7 +4447,6 @@ dependencies = [
"clap",
"const_format",
"crc32c",
"fail",
"fs2",
"futures",
"git-version",
@@ -4613,7 +4470,6 @@ dependencies = [
"serde",
"serde_json",
"serde_with",
"sha2",
"signal-hook",
"storage_broker",
"thiserror",
@@ -4820,12 +4676,6 @@ dependencies = [
"uuid",
]
[[package]]
name = "seq-macro"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
version = "1.0.183"
@@ -4948,12 +4798,6 @@ dependencies = [
"digest",
]
[[package]]
name = "sha1_smol"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
[[package]]
name = "sha2"
version = "0.10.6"
@@ -5352,17 +5196,6 @@ dependencies = [
"once_cell",
]
[[package]]
name = "thrift"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
dependencies = [
"byteorder",
"integer-encoding",
"ordered-float",
]
[[package]]
name = "time"
version = "0.3.21"
@@ -5907,16 +5740,6 @@ dependencies = [
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
dependencies = [
"cfg-if",
"static_assertions",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -6055,7 +5878,6 @@ dependencies = [
"chrono",
"const_format",
"criterion",
"fail",
"futures",
"heapless",
"hex",
@@ -6094,11 +5916,10 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.6.1"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
dependencies = [
"atomic",
"getrandom 0.2.11",
"serde",
]
@@ -6594,7 +6415,6 @@ dependencies = [
"num-integer",
"num-traits",
"once_cell",
"parquet",
"prost",
"rand 0.8.5",
"regex",

View File

@@ -89,6 +89,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -106,14 +107,11 @@ opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
@@ -155,7 +153,6 @@ tokio-rustls = "0.24"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
tokio-tungstenite = "0.20"
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
@@ -163,9 +160,8 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
tungstenite = "0.20"
url = "2.2"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.25"
x509-parser = "0.15"
@@ -219,16 +215,15 @@ tonic-build = "0.9"
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
################# Binary contents sections
[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true
strip = true # Automatically strip symbols from the binary.
opt-level = "z" # Optimize for size.
lto = true
# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]

View File

@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.75.0
ENV RUSTC_VERSION=1.74.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -151,7 +151,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install cargo-hakari && \
cargo install cargo-deny && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER=cachepot

View File

@@ -13,7 +13,6 @@ clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
opentelemetry.workspace = true
@@ -21,7 +20,6 @@ postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }

View File

@@ -40,22 +40,18 @@ use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
@@ -71,13 +67,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
thread::spawn(move || {
for sig in signals.forever() {
handle_exit_signal(sig);
}
});
let build_tag = option_env!("BUILD_TAG")
.unwrap_or(BUILD_TAG_DEFAULT)
.to_string();
@@ -350,20 +339,13 @@ fn main() -> Result<()> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some((mut pg, logs_handle)) = pg {
if let Some(mut pg) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);
// Process has exited, so we can join the logs thread.
let _ = logs_handle
.join()
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
@@ -537,24 +519,6 @@ fn cli() -> clap::Command {
)
}
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
/// wait for termination which would be easy then.
fn handle_exit_signal(sig: i32) {
info!("received {sig} termination signal");
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
kill(pg_pid, Signal::SIGTERM).ok();
}
exit(1);
}
#[test]
fn verify_cli() {
cli().debug_assert()

View File

@@ -6,8 +6,6 @@ use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::{Condvar, Mutex, RwLock};
use std::thread;
use std::time::Instant;
@@ -31,15 +29,11 @@ use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, RemotePath};
use crate::checker::create_availability_check_data;
use crate::logger::inlinify;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::{config, extension_server};
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
// Url type maintains proper escaping
@@ -280,7 +274,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
$$;"#,
roles_decl, database_decl,
);
info!("Neon superuser created:\n{}", inlinify(&query));
info!("Neon superuser created:\n{}", &query);
client
.simple_query(&query)
.map_err(|e| anyhow::anyhow!(e).context(query))?;
@@ -496,7 +490,7 @@ impl ComputeNode {
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();
let mut sync_handle = maybe_cgexec(&self.pgbin)
let sync_handle = maybe_cgexec(&self.pgbin)
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -505,29 +499,15 @@ impl ComputeNode {
vec![]
})
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.expect("postgres --sync-safekeepers failed to start");
SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
// `postgres --sync-safekeepers` will print all log output to stderr and
// final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
// will be collected in a child thread.
let stderr = sync_handle
.stderr
.take()
.expect("stderr should be captured");
let logs_handle = handle_postgres_logs(stderr);
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
// redirected to the caller output.
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
// Process has exited, so we can join the logs thread.
let _ = logs_handle
.join()
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
if !sync_output.status.success() {
anyhow::bail!(
@@ -665,12 +645,11 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
/// Returns a handle to the child process and a handle to the logs thread.
#[instrument(skip_all)]
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
) -> Result<std::process::Child> {
let pgdata_path = Path::new(&self.pgdata);
// Run postgres as a child process.
@@ -681,18 +660,12 @@ impl ComputeNode {
} else {
vec![]
})
.stderr(Stdio::piped())
.spawn()
.expect("cannot start postgres process");
PG_PID.store(pg.id(), Ordering::SeqCst);
// Start a thread to collect logs from stderr.
let stderr = pg.stderr.take().expect("stderr should be captured");
let logs_handle = handle_postgres_logs(stderr);
wait_for_postgres(&mut pg, pgdata_path)?;
Ok((pg, logs_handle))
Ok(pg)
}
/// Do initial configuration of the already started Postgres.
@@ -837,10 +810,7 @@ impl ComputeNode {
}
#[instrument(skip_all)]
pub fn start_compute(
&self,
extension_server_port: u16,
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
@@ -911,7 +881,7 @@ impl ComputeNode {
self.prepare_pgdata(&compute_state, extension_server_port)?;
let start_time = Utc::now();
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
@@ -961,7 +931,7 @@ impl ComputeNode {
};
info!(?metrics, "compute start finished");
Ok(pg_process)
Ok(pg)
}
// Look for core dumps and collect backtraces.

View File

@@ -38,9 +38,3 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
Ok(())
}
/// Replace all newline characters with a special character to make it
/// easier to grep for log messages.
pub fn inlinify(s: &str) -> String {
s.replace('\n', "\u{200B}")
}

View File

@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use tracing::{debug, info, warn};
use tracing::{debug, info};
use crate::compute::ComputeNode;
@@ -84,29 +84,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
}
// If there are existing (logical) walsenders, do not suspend.
//
// walproposer doesn't currently show up in pg_stat_replication,
// but protect if it will be
let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
match cli.query_one(ws_count_query, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_ws) => {
if num_ws > 0 {
last_active = Some(Utc::now());
}
}
Err(e) => {
warn!("failed to parse ws count: {:?}", e);
continue;
}
},
Err(e) => {
warn!("failed to get list of walsenders: {:?}", e);
continue;
}
}
// Update the last activity in the shared state if we got a more recent one.
let mut state = compute.state.lock().unwrap();
// NB: `Some(<DateTime>)` is always greater than `None`.

View File

@@ -6,15 +6,12 @@ use std::io::{BufRead, BufReader};
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::Child;
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use anyhow::{bail, Result};
use ini::Ini;
use notify::{RecursiveMode, Watcher};
use postgres::{Client, Transaction};
use tokio::io::AsyncBufReadExt;
use tokio::time::timeout;
use tokio_postgres::NoTls;
use tracing::{debug, error, info, instrument};
@@ -429,72 +426,3 @@ pub async fn tune_pgbouncer(
Ok(())
}
/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
/// and send them to the logger. In the future we may also want to add context to
/// these logs.
pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
std::thread::spawn(move || {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to build tokio runtime");
let res = runtime.block_on(async move {
let stderr = tokio::process::ChildStderr::from_std(stderr)?;
handle_postgres_logs_async(stderr).await
});
if let Err(e) = res {
tracing::error!("error while processing postgres logs: {}", e);
}
})
}
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
/// - next line starts with timestamp
/// - EOF
/// - no new lines were written for the last second
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
let mut lines = tokio::io::BufReader::new(stderr).lines();
let timeout_duration = Duration::from_secs(1);
let ts_regex =
regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");
let mut buf = vec![];
loop {
let next_line = timeout(timeout_duration, lines.next_line()).await;
// we should flush lines from the buffer if we cannot continue reading multiline message
let should_flush_buf = match next_line {
// Flushing if new line starts with timestamp
Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
// Flushing on EOF, timeout or error
_ => true,
};
if !buf.is_empty() && should_flush_buf {
// join multiline message into a single line, separated by unicode Zero Width Space.
// "PG:" suffix is used to distinguish postgres logs from other logs.
let combined = format!("PG:{}\n", buf.join("\u{200B}"));
buf.clear();
// sync write to stderr to avoid interleaving with other logs
use std::io::Write;
let res = std::io::stderr().lock().write_all(combined.as_bytes());
if let Err(e) = res {
tracing::error!("error while writing to stderr: {}", e);
}
}
// if not timeout, append line to the buffer
if next_line.is_ok() {
match next_line?? {
Some(line) => buf.push(line),
// EOF
None => break,
};
}
}
Ok(())
}

View File

@@ -9,7 +9,6 @@ use reqwest::StatusCode;
use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
use crate::config;
use crate::logger::inlinify;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
@@ -663,11 +662,7 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
$$;"
.to_string();
info!(
"grant query for db {} : {}",
&db.name,
inlinify(&grant_query)
);
info!("grant query for db {} : {}", &db.name, &grant_query);
db_client.simple_query(&grant_query)?;
}

View File

@@ -6,11 +6,11 @@
//! rely on `neon_local` to set up the environment for each test.
//!
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::local_env::LocalEnv;
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::tenant_migration::migrate_tenant;
@@ -338,7 +338,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_one("force").expect("we set a default value");
let force = init_match.get_flag("force");
env.init(pg_version, force)
.context("Failed to initialize neon repository")?;
@@ -1266,15 +1266,9 @@ fn cli() -> Command {
.required(false);
let force_arg = Arg::new("force")
.value_parser(value_parser!(InitForceMode))
.value_parser(value_parser!(bool))
.long("force")
.default_value(
InitForceMode::MustNotExist
.to_possible_value()
.unwrap()
.get_name()
.to_owned(),
)
.action(ArgAction::SetTrue)
.help("Force initialization even if the repository is not empty")
.required(false);

View File

@@ -46,8 +46,6 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::RemoteExtSpec;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
@@ -441,14 +439,11 @@ impl Endpoint {
Ok(())
}
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
if send_sigterm {
kill(pid, Signal::SIGTERM).ok();
}
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(())
}
@@ -738,15 +733,10 @@ impl Endpoint {
&None,
)?;
// Also wait for the compute_ctl process to die. It might have some
// cleanup work to do after postgres stops, like syncing safekeepers,
// etc.
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
// want this cleanup: tests intentionally do stop when majority of
// safekeepers is down, so sync-safekeepers would hang otherwise. This
// could be a separate flag though.
self.wait_for_compute_ctl_to_exit(destroy)?;
self.wait_for_compute_ctl_to_exit()?;
if destroy {
println!(
"Destroying postgres data directory '{}'",

View File

@@ -5,7 +5,6 @@
use anyhow::{bail, ensure, Context};
use clap::ValueEnum;
use postgres_backend::AuthType;
use reqwest::Url;
use serde::{Deserialize, Serialize};
@@ -163,31 +162,6 @@ impl Default for SafekeeperConf {
}
}
#[derive(Clone, Copy)]
pub enum InitForceMode {
MustNotExist,
EmptyDirOk,
RemoveAllContents,
}
impl ValueEnum for InitForceMode {
fn value_variants<'a>() -> &'a [Self] {
&[
Self::MustNotExist,
Self::EmptyDirOk,
Self::RemoveAllContents,
]
}
fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
Some(clap::builder::PossibleValue::new(match self {
InitForceMode::MustNotExist => "must-not-exist",
InitForceMode::EmptyDirOk => "empty-dir-ok",
InitForceMode::RemoveAllContents => "remove-all-contents",
}))
}
}
impl SafekeeperConf {
/// Compute is served by port on which only tenant scoped tokens allowed, if
/// it is configured.
@@ -410,7 +384,7 @@ impl LocalEnv {
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
@@ -419,34 +393,25 @@ impl LocalEnv {
);
if base_path.exists() {
match force {
InitForceMode::MustNotExist => {
bail!(
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
}
InitForceMode::EmptyDirOk => {
if let Some(res) = std::fs::read_dir(base_path)?.next() {
res.context("check if directory is empty")?;
anyhow::bail!("directory not empty: {base_path:?}");
}
}
InitForceMode::RemoveAllContents => {
println!("removing all contents of '{}'", base_path.display());
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
// all contents inside. This helps if the developer symbol links another directory (i.e.,
// S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
if force {
println!("removing all contents of '{}'", base_path.display());
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
// all contents inside. This helps if the developer symbol links another directory (i.e.,
// S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
}
} else {
bail!(
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
base_path.display()
);
}
}

View File

@@ -485,13 +485,6 @@ impl PageServerNode {
Ok(self.http_client.list_timelines(*tenant_id).await?)
}
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
Ok(self
.http_client
.tenant_secondary_download(*tenant_id)
.await?)
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,

View File

@@ -11,7 +11,6 @@ use crate::{
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::time::Duration;
use utils::{
@@ -41,9 +40,9 @@ async fn await_lsn(
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(_e) => {
Err(e) => {
println!(
"🕑 Waiting for pageserver {} to activate...",
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
pageserver.conf.id
);
std::thread::sleep(Duration::from_millis(500));
@@ -90,7 +89,7 @@ pub async fn migrate_tenant(
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
println!("🤔 Checking existing status...");
// Get a new generation
let attachment_service = AttachmentService::from_env(env);
fn build_location_config(
@@ -136,20 +135,6 @@ pub async fn migrate_tenant(
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}
println!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps.conf.id
);
match dest_ps
.tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
.await
{
Ok(()) => {}
Err(_) => {
println!(" (skipping, destination wasn't in secondary mode)")
}
}
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;

View File

@@ -1,142 +0,0 @@
# Vectored Timeline Get
Created on: 2024-01-02
Author: Christian Schwarz
# Summary
A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
# Motivation
During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
For an example, see
https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.
Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
If it's negative (no entry), we resume layer map traversal.
If it's positive, we collect the result in our reconstruct data bag.
If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
Otherwise, we resume layer map traversal.
Doing this many `Timeline::get` calls is quite inefficient because:
1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys.
This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but
may not contain the data we're looking for.
3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.
[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9
# Solution
We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks:
* more efficient basebackup
* batched IO during compaction (useful for strides of unchanged pages)
* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
* if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API
# DoD
There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`.
It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
It is sufficient to simply return a `Vec<Bytes>`, but, likely more efficient solutions can be found after studying all the callers of `Timeline::get`.
Functionally, the behavior of `Timeline::get_vectored` is equivalent to
```rust
let mut keys_iter: impl Iterator<Item=Key>
= src.map(|KeyVec{ base, count }| (base..base+count)).flatten();
let mut out = Vec::new();
for key in keys_iter {
let data = Timeline::get(key, lsn)?;
out.push(data);
}
return out;
```
However, unlike above, an ideal solution will
* Visit each `struct Layer` at most once.
* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
* This means, read each `DiskBtree` page at most once.
* Facilitate merging of the reads we issue to the OS and eventually NVMe.
Each of these items above represents a signficant amount of work.
## Performance
Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
A reasonable constant overhead over current `Timeline::get` is acceptable.
The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.
# Implementation
High-level set of tasks / changes to be made:
- **Get clarity on API**:
- Define naive `Timeline::get_vectored` implementation & adopt it across pageserver.
- The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
- Start with something simple to explore the different usages of the API.
Then iterate with peers until we have something that is good enough.
- **Vectored Layer Map traversal**
- Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
- Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
- The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385)
but need more.
Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data.
- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
- Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
- Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
- Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
- What needs to happen to `DiskBtree::visit()`?
* Minimally
* take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
* Change the visit code to to invoke the callback for all values in the `KeyVec`'s key range
* This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
* Ideally:
* Take a `&[KeyVec]`, sort it;
* during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
* NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
- **Facilitate merging of the reads we issue to the OS and eventually NVMe.**
- The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
- [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
- We hit (and rely) on `PageCache` and `VirtualFile here (not great under pressure)
- [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
- What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
- That is tricky because
- the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824)
- there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
- The guiding principle here should be to avoid coupling this work to the `PageCache`.
- I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
Let's see how we can improve by doing the first three items in above list first, then revisit.
## Rollout / Feature Flags
No feature flags are required for this epic.
At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.
It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks.
That will help isolate performance regressions across weekly releases.
# Interaction With Sharding
[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`.
Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.
Given that this is already the case, there shouldn't be significant interaction/interference with sharding.
However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint.

View File

@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
### Obligatory checks
We force code formatting via `ruff`, and type hints via `mypy`.
We force code formatting via `black`, `ruff`, and type hints via `mypy`.
Run the following commands in the repository's root (next to `pyproject.toml`):
```bash
poetry run ruff format . # All code is reformatted
poetry run ruff check . # Python linter
poetry run mypy . # Ensure there are no typing errors
poetry run black . # All code is reformatted
poetry run ruff . # Python linter
poetry run mypy . # Ensure there are no typing errors
```
**WARNING**: do not run `mypy` from a directory other than the root of the repository.

View File

@@ -141,9 +141,8 @@ impl Key {
}
}
#[inline(always)]
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
key.field1 == 0x00 && key.field4 != 0
}
impl std::str::FromStr for Key {

View File

@@ -114,21 +114,16 @@ impl KeySpaceAccum {
}
}
#[inline(always)]
pub fn add_key(&mut self, key: Key) {
self.add_range(singleton_range(key))
}
#[inline(always)]
pub fn add_range(&mut self, range: Range<Key>) {
match self.accum.as_mut() {
Some(accum) => {
if range.start == accum.end {
accum.end = range.end;
} else {
// TODO: to efficiently support small sharding stripe sizes, we should avoid starting
// a new range here if the skipped region was all keys that don't belong on this shard.
// (https://github.com/neondatabase/neon/issues/6247)
assert!(range.start > accum.end);
self.ranges.push(accum.clone());
*accum = range;

View File

@@ -2,7 +2,7 @@ pub mod partitioning;
use std::{
collections::HashMap,
io::{BufRead, Read},
io::Read,
num::{NonZeroU64, NonZeroUsize},
time::SystemTime,
};
@@ -557,6 +557,19 @@ pub enum DownloadRemoteLayersTaskState {
ShutDown,
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
@@ -813,10 +826,9 @@ impl PagestreamBeMessage {
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
}
Tag::Error => {
let mut msg = Vec::new();
buf.read_until(0, &mut msg)?;
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
let rust_str = cstring.to_str()?;
let buf = buf.get_ref();
let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
let rust_str = cstr.to_str()?;
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: rust_str.to_owned(),
})

View File

@@ -422,21 +422,6 @@ impl ShardIdentity {
}
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
// A: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
false
} else {
!self.is_key_local(key)
}
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -530,7 +515,12 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
!is_rel_block_key(key)
//
// In this condition:
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
// all metadata.
// - field6 is set to -1 for relation size pages.
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -35,12 +35,6 @@ pub enum QueryError {
/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,
/// Query handler indicated that client should reconnect
#[error("Server requested reconnect")]
Reconnect,
/// Query named an entity that was not found
#[error("Not found: {0}")]
NotFound(std::borrow::Cow<'static, str>),
/// Authentication failure
#[error("Unauthorized: {0}")]
Unauthorized(std::borrow::Cow<'static, str>),
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
info!("Stopped due to shutdown");
Ok(())
}
Err(QueryError::Reconnect) => {
// Dropping out of this loop implicitly disconnects
info!("Stopped due to handler reconnect request");
Ok(())
}
Err(QueryError::Disconnected(e)) => {
info!("Disconnected ({e:#})");
// Disconnection is not an error: we just use it that way internally to drop
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Reconnect => "reconnect".to_string(),
QueryError::Shutdown => "shutdown".to_string(),
QueryError::NotFound(_) => "not found".to_string(),
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
QueryError::Other(e) => format!("{e:#}"),
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::SimulatedConnectionError => {
error!("query handler for query '{query}' failed due to a simulated connection error")
}
QueryError::Reconnect => {
info!("query handler for '{query}' requested client to reconnect")
}
QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown")
}
QueryError::NotFound(reason) => {
info!("query handler for '{query}' entity not found: {reason}")
}
QueryError::Unauthorized(e) => {
warn!("query handler for '{query}' failed with authentication error: {e}");
}

View File

@@ -322,12 +322,6 @@ impl RemoteStorage for AzureBlobStorage {
}
Ok(())
}
async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
Err(anyhow::anyhow!(
"copy for azure blob storage is not implemented"
))
}
}
pin_project_lite::pin_project! {

View File

@@ -207,9 +207,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
/// Copy a remote object inside a bucket from one path to another.
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
}
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -377,15 +374,6 @@ impl GenericRemoteStorage {
Self::Unreliable(s) => s.delete_objects(paths).await,
}
}
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.copy(from, to).await,
Self::AwsS3(s) => s.copy(from, to).await,
Self::AzureBlob(s) => s.copy(from, to).await,
Self::Unreliable(s) => s.copy(from, to).await,
}
}
}
impl GenericRemoteStorage {
@@ -672,7 +660,6 @@ impl ConcurrencyLimiter {
RequestKind::Put => &self.write,
RequestKind::List => &self.read,
RequestKind::Delete => &self.write,
RequestKind::Copy => &self.write,
}
}

View File

@@ -409,20 +409,6 @@ impl RemoteStorage for LocalFs {
}
Ok(())
}
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let from_path = from.with_base(&self.storage_root);
let to_path = to.with_base(&self.storage_root);
create_target_directory(&to_path).await?;
fs::copy(&from_path, &to_path).await.with_context(|| {
format!(
"Failed to copy file from '{from_path}' to '{to_path}'",
from_path = from_path,
to_path = to_path
)
})?;
Ok(())
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -493,38 +493,6 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
// we need to specify bucket_name as a prefix
let copy_source = format!(
"{}/{}",
self.bucket_name,
self.relative_path_to_s3_object(from)
);
let res = self
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.copy_source(copy_source)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;
Ok(())
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`

View File

@@ -11,7 +11,6 @@ pub(crate) enum RequestKind {
Put = 1,
Delete = 2,
List = 3,
Copy = 4,
}
use RequestKind::*;
@@ -23,7 +22,6 @@ impl RequestKind {
Put => "put_object",
Delete => "delete_object",
List => "list_objects",
Copy => "copy_object",
}
}
const fn as_index(&self) -> usize {
@@ -31,7 +29,7 @@ impl RequestKind {
}
}
pub(super) struct RequestTyped<C>([C; 5]);
pub(super) struct RequestTyped<C>([C; 4]);
impl<C> RequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +38,8 @@ impl<C> RequestTyped<C> {
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy].into_iter();
let arr = std::array::from_fn::<C, 5, _>(|index| {
let mut it = [Get, Put, Delete, List].into_iter();
let arr = std::array::from_fn::<C, 4, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)

View File

@@ -162,11 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
}
Ok(())
}
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
// copy is equivalent to download + upload
self.attempt(RemoteOp::Download(from.clone()))?;
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner.copy_object(from, to).await
}
}

View File

@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
#[serde(default)]
pub http_connstr: Option<String>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TimelineCopyRequest {
pub target_timeline_id: TimelineId,
pub until_lsn: Lsn,
}

View File

@@ -4,12 +4,6 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
@@ -22,7 +16,6 @@ chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true
nix.workspace = true

View File

@@ -1,177 +0,0 @@
//! Failpoint support code shared between pageserver and safekeepers.
use crate::http::{
error::ApiError,
json::{json_request, json_response},
};
use hyper::{Body, Request, Response, StatusCode};
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::*;
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
/// specified time (in milliseconds). The main difference is that we use async
/// tokio sleep function. Another difference is that we print lines to the log,
/// which can be useful in tests to check that the failpoint was hit.
///
/// Optionally pass a cancellation token, and this failpoint will drop out of
/// its sleep when the cancellation token fires. This is useful for testing
/// cases where we would like to block something, but test its clean shutdown behavior.
#[macro_export]
macro_rules! __failpoint_sleep_millis_async {
($name:literal) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
}
}};
($name:literal, $cancel:expr) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_cancellable_helper(
$name,
duration_str,
$cancel,
)
.await
}
}};
}
pub use __failpoint_sleep_millis_async as sleep_millis_async;
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
#[doc(hidden)]
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", name);
}
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
#[doc(hidden)]
pub async fn failpoint_sleep_cancellable_helper(
name: &'static str,
duration_str: String,
cancel: &CancellationToken,
) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::timeout(d, cancel.cancelled()).await.ok();
tracing::info!("failpoint {:?}: sleep done", name);
}
pub fn init() -> fail::FailScenario<'static> {
// The failpoints lib provides support for parsing the `FAILPOINTS` env var.
// We want non-default behavior for `exit`, though, so, we handle it separately.
//
// Format for FAILPOINTS is "name=actions" separated by ";".
let actions = std::env::var("FAILPOINTS");
if actions.is_ok() {
std::env::remove_var("FAILPOINTS");
} else {
// let the library handle non-utf8, or nothing for not present
}
let scenario = fail::FailScenario::setup();
if let Ok(val) = actions {
val.split(';')
.enumerate()
.map(|(i, s)| s.split_once('=').ok_or((i, s)))
.for_each(|res| {
let (name, actions) = match res {
Ok(t) => t,
Err((i, s)) => {
panic!(
"startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
i + 1,
);
}
};
if let Err(e) = apply_failpoint(name, actions) {
panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
}
});
}
scenario
}
pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
if actions == "exit" {
fail::cfg_callback(name, exit_failpoint)
} else {
fail::cfg(name, actions)
}
}
#[inline(never)]
fn exit_failpoint() {
tracing::info!("Exit requested by failpoint");
std::process::exit(1);
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
/// Configure failpoints through http.
pub async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot manage failpoints because storage was compiled without failpoints support"
)));
}
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
for fp in failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Failed to configure failpoints: {err_msg}"
)));
}
}
json_response(StatusCode::OK, ())
}

View File

@@ -31,9 +31,6 @@ pub enum ApiError {
#[error("Shutting down")]
ShuttingDown,
#[error("Timeout")]
Timeout(Cow<'static, str>),
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -70,10 +67,6 @@ impl ApiError {
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -83,10 +83,6 @@ pub mod timeout;
pub mod sync;
pub mod failpoint_support;
pub mod yielding_loop;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -15,12 +15,6 @@ pub struct Gate {
name: String,
}
impl std::fmt::Debug for Gate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Gate<{}>", self.name)
}
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
#[derive(Debug)]

View File

@@ -1,35 +0,0 @@
use tokio_util::sync::CancellationToken;
#[derive(thiserror::Error, Debug)]
pub enum YieldingLoopError {
#[error("Cancelled")]
Cancelled,
}
/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
/// yields to avoid blocking the executor, and after resuming checks the provided
/// cancellation token to drop out promptly on shutdown.
#[inline(always)]
pub async fn yielding_loop<I, T, F>(
interval: usize,
cancel: &CancellationToken,
iter: I,
mut visitor: F,
) -> Result<(), YieldingLoopError>
where
I: Iterator<Item = T>,
F: FnMut(T),
{
for (i, item) in iter.enumerate() {
visitor(item);
if i + 1 % interval == 0 {
tokio::task::yield_now().await;
if cancel.is_cancelled() {
return Err(YieldingLoopError::Cancelled);
}
}
}
Ok(())
}

View File

@@ -446,11 +446,12 @@ impl Runner {
if let Some(t) = self.last_upscale_request_at {
let elapsed = t.elapsed();
if elapsed < Duration::from_secs(1) {
// *Ideally* we'd like to log here that we're ignoring the fact the
// memory stats are too high, but in practice this can result in
// spamming the logs with repetitive messages about ignoring the signal
//
// See https://github.com/neondatabase/neon/issues/5865 for more.
info!(
elapsed_millis = elapsed.as_millis(),
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
threshold = bytes_to_mebibytes(cgroup.threshold),
"cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
);
continue;
}
}

View File

@@ -425,7 +425,7 @@ mod tests {
}
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
println!("wp_log[{}] {}", level, msg);
println!("walprop_log[{}] {}", level, msg);
}
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {

View File

@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
use pageserver::{
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
};
use pageserver_api::shard::TenantShardId;
use utils::{id::TenantId, lsn::Lsn};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let tenant_id = TenantId::generate();
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = PostgresRedoManager::new(conf, tenant_id);
let manager = Arc::new(manager);

View File

@@ -1,4 +1,4 @@
use pageserver_api::{models::*, shard::TenantShardId};
use pageserver_api::models::*;
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
@@ -28,12 +28,14 @@ pub enum Error {
pub type Result<T> = std::result::Result<T, Error>;
pub(crate) trait ResponseErrorMessageExt: Sized {
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
async fn error_from_body(mut self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
@@ -49,11 +51,6 @@ impl ResponseErrorMessageExt for reqwest::Response {
}
}
pub enum ForceAwaitLogicalSize {
Yes,
No,
}
impl Client {
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self {
@@ -97,18 +94,11 @@ impl Client {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
force_await_logical_size: ForceAwaitLogicalSize,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
let uri = match force_await_logical_size {
ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true),
ForceAwaitLogicalSize::No => uri,
};
self.get(&uri)
.await?
.json()
@@ -174,18 +164,6 @@ impl Client {
Ok(())
}
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/secondary/download",
self.mgmt_api_endpoint, tenant_id
);
self.request(Method::POST, &uri, ())
.await?
.error_for_status()
.map(|_| ())
.map_err(|e| Error::ApiError(format!("{}", e)))
}
pub async fn location_config(
&self,
tenant_id: TenantId,
@@ -221,16 +199,4 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",
self.mgmt_api_endpoint, tenant_shard_id
);
self.request(Method::POST, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -115,8 +115,15 @@ impl PagestreamClient {
pub async fn getpage(
&mut self,
req: PagestreamGetPageRequest,
key: RelTagBlockNo,
lsn: Lsn,
) -> anyhow::Result<PagestreamGetPageResponse> {
let req = PagestreamGetPageRequest {
latest: false,
rel: key.rel_tag,
blkno: key.block_no,
lsn,
};
let req = PagestreamFeMessage::GetPage(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);

View File

@@ -8,7 +8,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
hdrhistogram.workspace = true
@@ -19,7 +18,6 @@ serde.workspace = true
serde_json.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-util.workspace = true
pageserver = { path = ".." }
pageserver_client.workspace = true

View File

@@ -1,5 +1,4 @@
use anyhow::Context;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
use utils::id::TenantTimelineId;
@@ -93,12 +92,10 @@ async fn main_impl(
for timeline in &timelines {
js.spawn({
let timeline = *timeline;
// FIXME: this triggers initial logical size calculation
// https://github.com/neondatabase/neon/issues/6168
let info = mgmt_api_client
.timeline_info(
timeline.tenant_id,
timeline.timeline_id,
ForceAwaitLogicalSize::No,
)
.timeline_info(timeline.tenant_id, timeline.timeline_id)
.await
.unwrap();
async move {

View File

@@ -1,13 +1,10 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use futures::future::join_all;
use pageserver::pgdatadir_mapping::key_to_rel_block;
use pageserver::repository;
use pageserver_api::key::is_rel_block_key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_client::page_service::RelTagBlockNo;
use tokio_util::sync::CancellationToken;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -16,7 +13,7 @@ use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{info, instrument};
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
@@ -42,17 +39,8 @@ pub(crate) struct Args {
runtime: Option<humantime::Duration>,
#[clap(long)]
per_target_rate_limit: Option<usize>,
/// Probability for sending `latest=true` in the request (uniform distribution).
#[clap(long, default_value = "1")]
req_latest_probability: f64,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
/// For large pageserver installations, enumerating the keyspace takes a lot of time.
/// If specified, the specified path is used to maintain a cache of the keyspace enumeration result.
/// The cache is tagged and auto-invalided by the tenant/timeline ids only.
/// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
#[clap(long)]
keyspace_cache: Option<Utf8PathBuf>,
targets: Option<Vec<TenantTimelineId>>,
}
@@ -67,7 +55,7 @@ impl LiveStats {
}
}
#[derive(Clone, serde::Serialize, serde::Deserialize)]
#[derive(Clone)]
struct KeyRange {
timeline: TenantTimelineId,
timeline_lsn: Lsn,
@@ -115,107 +103,59 @@ async fn main_impl(
)
.await?;
#[derive(serde::Deserialize)]
struct KeyspaceCacheDe {
tag: Vec<TenantTimelineId>,
data: Vec<KeyRange>,
}
#[derive(serde::Serialize)]
struct KeyspaceCacheSer<'a> {
tag: &'a [TenantTimelineId],
data: &'a [KeyRange],
}
let cache = args
.keyspace_cache
.as_ref()
.map(|keyspace_cache_file| {
let contents = match std::fs::read(keyspace_cache_file) {
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
return anyhow::Ok(None);
}
x => x.context("read keyspace cache file")?,
};
let cache: KeyspaceCacheDe =
serde_json::from_slice(&contents).context("deserialize cache file")?;
let tag_ok = HashSet::<TenantTimelineId>::from_iter(cache.tag.into_iter())
== HashSet::from_iter(timelines.iter().cloned());
info!("keyspace cache file matches tag: {tag_ok}");
anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
})
.transpose()?
.flatten();
let all_ranges: Vec<KeyRange> = if let Some(cached) = cache {
info!("using keyspace cache file");
cached
} else {
let mut js = JoinSet::new();
for timeline in &timelines {
js.spawn({
let mgmt_api_client = Arc::clone(&mgmt_api_client);
let timeline = *timeline;
async move {
let partitioning = mgmt_api_client
.keyspace(timeline.tenant_id, timeline.timeline_id)
.await?;
let lsn = partitioning.at_lsn;
let start = Instant::now();
let mut filtered = KeySpaceAccum::new();
// let's hope this is inlined and vectorized...
// TODO: turn this loop into a is_rel_block_range() function.
for r in partitioning.keys.ranges.iter() {
let mut i = r.start;
while i != r.end {
if is_rel_block_key(&i) {
filtered.add_key(i);
}
i = i.next();
}
}
let filtered = filtered.to_keyspace();
let filter_duration = start.elapsed();
let mut js = JoinSet::new();
for timeline in &timelines {
js.spawn({
let mgmt_api_client = Arc::clone(&mgmt_api_client);
let timeline = *timeline;
async move {
let partitioning = mgmt_api_client
.keyspace(timeline.tenant_id, timeline.timeline_id)
.await?;
let lsn = partitioning.at_lsn;
anyhow::Ok((
filter_duration,
filtered.ranges.into_iter().map(move |r| KeyRange {
timeline,
timeline_lsn: lsn,
start: r.start.to_i128(),
end: r.end.to_i128(),
}),
))
}
});
}
let mut total_filter_duration = Duration::from_secs(0);
let mut all_ranges: Vec<KeyRange> = Vec::new();
while let Some(res) = js.join_next().await {
let (filter_duration, range) = res.unwrap().unwrap();
all_ranges.extend(range);
total_filter_duration += filter_duration;
}
info!("filter duration: {}", total_filter_duration.as_secs_f64());
if let Some(cachefile) = args.keyspace_cache.as_ref() {
let cache = KeyspaceCacheSer {
tag: &timelines,
data: &all_ranges,
};
let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
info!("successfully wrote keyspace cache file");
}
all_ranges
};
let ranges = partitioning
.keys
.ranges
.iter()
.filter_map(|r| {
let start = r.start;
let end = r.end;
// filter out non-relblock keys
match (is_rel_block_key(&start), is_rel_block_key(&end)) {
(true, true) => Some(KeyRange {
timeline,
timeline_lsn: lsn,
start: start.to_i128(),
end: end.to_i128(),
}),
(true, false) | (false, true) => {
unimplemented!("split up range")
}
(false, false) => None,
}
})
.collect::<Vec<_>>();
anyhow::Ok(ranges)
}
});
}
let mut all_ranges: Vec<KeyRange> = Vec::new();
while let Some(res) = js.join_next().await {
all_ranges.extend(res.unwrap().unwrap());
}
let live_stats = Arc::new(LiveStats::default());
let num_client_tasks = timelines.len();
let num_live_stats_dump = 1;
let num_work_sender_tasks = 1;
let num_main_impl = 1;
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
));
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
tokio::spawn({
let stats = Arc::clone(&live_stats);
@@ -235,143 +175,112 @@ async fn main_impl(
}
});
let cancel = CancellationToken::new();
let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
let mut work_senders = HashMap::new();
let mut tasks = Vec::new();
for tl in &timelines {
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
work_senders.insert(*tl, sender);
work_senders.insert(tl, sender);
tasks.push(tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
cancel.clone(),
)));
}
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
let start_work_barrier = start_work_barrier.clone();
let cancel = cancel.clone();
match args.per_target_rate_limit {
None => Box::pin(async move {
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
None => Box::pin(async move {
let weights = rand::distributions::weighted::WeightedIndex::new(
all_ranges.iter().map(|v| v.len()),
)
.unwrap();
start_work_barrier.wait().await;
loop {
let (range, key) = {
let mut rng = rand::thread_rng();
let r = &all_ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
(r, RelTagBlockNo { rel_tag, block_no })
};
let sender = work_senders.get(&range.timeline).unwrap();
// TODO: what if this blocks?
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
}
}),
Some(rps_limit) => Box::pin(async move {
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
let make_timeline_task: &dyn Fn(
TenantTimelineId,
)
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
let sender = work_senders.get(&timeline).unwrap();
let ranges: Vec<KeyRange> = all_ranges
.iter()
.filter(|r| r.timeline == timeline)
.cloned()
.collect();
let weights = rand::distributions::weighted::WeightedIndex::new(
all_ranges.iter().map(|v| v.len()),
ranges.iter().map(|v| v.len()),
)
.unwrap();
start_work_barrier.wait().await;
while !cancel.is_cancelled() {
let (timeline, req) = {
let mut rng = rand::thread_rng();
let r = &all_ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
(
r.timeline,
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
},
)
};
let sender = work_senders.get(&timeline).unwrap();
// TODO: what if this blocks?
if sender.send(req).await.is_err() {
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
Box::pin(async move {
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(
/* TODO review this choice */
tokio::time::MissedTickBehavior::Burst,
);
loop {
ticker.tick().await;
let (range, key) = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let (rel_tag, block_no) = key_to_rel_block(key)
.expect("we filter non-rel-block keys out above");
(r, RelTagBlockNo { rel_tag, block_no })
};
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
}
}
}),
Some(rps_limit) => Box::pin(async move {
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
let make_timeline_task: &dyn Fn(
TenantTimelineId,
)
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
let sender = work_senders.get(&timeline).unwrap();
let ranges: Vec<KeyRange> = all_ranges
.iter()
.filter(|r| r.timeline == timeline)
.cloned()
.collect();
let weights = rand::distributions::weighted::WeightedIndex::new(
ranges.iter().map(|v| v.len()),
)
.unwrap();
})
};
let cancel = cancel.clone();
Box::pin(async move {
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(
/* TODO review this choice */
tokio::time::MissedTickBehavior::Burst,
);
while !cancel.is_cancelled() {
ticker.tick().await;
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) = key_to_rel_block(key)
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
};
if sender.send(req).await.is_err() {
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
}
}
})
};
let tasks: Vec<_> = work_senders
.keys()
.map(|tl| make_timeline_task(**tl))
.collect();
let tasks: Vec<_> = work_senders
.keys()
.map(|tl| make_timeline_task(*tl))
.collect();
start_work_barrier.wait().await;
start_work_barrier.wait().await;
join_all(tasks).await;
}),
}
join_all(tasks).await;
}),
};
let work_sender_task = tokio::spawn(work_sender);
if let Some(runtime) = args.runtime {
info!("waiting for everything to become ready");
start_work_barrier.wait().await;
info!("work started");
tokio::time::sleep(runtime.into()).await;
info!("runtime over, signalling cancellation");
cancel.cancel();
work_sender_task.await.unwrap();
info!("work sender exited");
match tokio::time::timeout(runtime.into(), work_sender).await {
Ok(()) => unreachable!("work sender never terminates"),
Err(_timeout) => {
// this implicitly drops the work_senders, making all the clients exit
}
}
} else {
work_sender_task.await.unwrap();
work_sender.await;
unreachable!("work sender never terminates");
}
info!("joining clients");
for t in tasks {
t.await.unwrap();
}
info!("all clients stopped");
let output = Output {
total: {
let mut agg_stats = request_stats::Stats::new();
@@ -394,10 +303,12 @@ async fn client(
args: &'static Args,
timeline: TenantTimelineId,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
all_work_done_barrier: Arc<Barrier>,
live_stats: Arc<LiveStats>,
cancel: CancellationToken,
) {
start_work_barrier.wait().await;
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
@@ -406,18 +317,12 @@ async fn client(
.await
.unwrap();
start_work_barrier.wait().await;
while let Some(req) =
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
{
while let Some((key, lsn)) = work.recv().await {
let start = Instant::now();
let res = tokio::select! {
res = client.getpage(req) => { res },
_ = cancel.cancelled() => { return; }
};
res.with_context(|| format!("getpage for {timeline}"))
client
.getpage(key, lsn)
.await
.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
@@ -425,4 +330,6 @@ async fn client(
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
all_work_done_barrier.wait().await;
}

View File

@@ -4,8 +4,6 @@ use humantime::Duration;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
@@ -58,15 +56,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
for tl in timelines {
let mgmt_api_client = Arc::clone(&mgmt_api_client);
js.spawn(async move {
// TODO: API to explicitly trigger initial logical size computation.
// Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
// => https://github.com/neondatabase/neon/issues/6168
let info = mgmt_api_client
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.timeline_info(tl.tenant_id, tl.timeline_id)
.await
.unwrap();
// Polling should not be strictly required here since we await
// for the initial logical size, however it's possible for the request
// to land before the timeline is initialised. This results in an approximate
// logical size.
if let Some(period) = args.poll_for_completion {
let mut ticker = tokio::time::interval(period.into());
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
@@ -74,7 +71,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
while !info.current_logical_size_is_accurate {
ticker.tick().await;
info = mgmt_api_client
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.timeline_info(tl.tenant_id, tl.timeline_id)
.await
.unwrap();
}

View File

@@ -23,7 +23,6 @@ use tracing::*;
use tokio_tar::{Builder, EntryType, Header};
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -175,7 +174,7 @@ where
] {
for segno in self
.timeline
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
.list_slru_segments(kind, self.lsn, self.ctx)
.await?
{
self.add_slru_segment(kind, segno).await?;
@@ -193,7 +192,7 @@ where
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
@@ -268,7 +267,7 @@ where
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
.get_rel_size(src, self.lsn, false, self.ctx)
.await?;
// If the relation is empty, create an empty file
@@ -289,7 +288,7 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
@@ -311,7 +310,7 @@ where
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -353,7 +352,7 @@ where
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
.await?;
ensure!(
@@ -400,7 +399,7 @@ where
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?
.is_empty()
{

View File

@@ -31,7 +31,6 @@ use pageserver::{
virtual_file,
};
use postgres_backend::AuthType;
use utils::failpoint_support;
use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals;
use utils::{
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
}
// Initialize up failpoints support
let scenario = failpoint_support::init();
let scenario = pageserver::failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);

View File

@@ -37,8 +37,8 @@ use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,9 +75,6 @@ pub mod defaults {
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
///
/// Default built-in configuration file.
@@ -91,7 +88,6 @@ pub mod defaults {
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
# initial superuser role name to use when creating a new tenant
@@ -112,8 +108,6 @@ pub mod defaults {
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -131,7 +125,6 @@ pub mod defaults {
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
[remote_storage]
@@ -240,13 +233,6 @@ pub struct PageServerConf {
/// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
/// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
/// deprioritises secondary downloads vs. remote storage operations for attached tenants.
pub secondary_download_concurrency: usize,
/// Maximum number of WAL records to be ingested and committed at the same time
pub ingest_batch_size: u64,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -328,9 +314,6 @@ struct PageServerConfigBuilder {
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
secondary_download_concurrency: BuilderValue<usize>,
ingest_batch_size: BuilderValue<u64>,
}
impl Default for PageServerConfigBuilder {
@@ -403,9 +386,6 @@ impl Default for PageServerConfigBuilder {
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
}
}
}
@@ -554,14 +534,6 @@ impl PageServerConfigBuilder {
self.heatmap_upload_concurrency = BuilderValue::Set(value)
}
pub fn secondary_download_concurrency(&mut self, value: usize) {
self.secondary_download_concurrency = BuilderValue::Set(value)
}
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
@@ -660,15 +632,10 @@ impl PageServerConfigBuilder {
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
secondary_download_concurrency: self
.secondary_download_concurrency
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
})
}
}
@@ -726,11 +693,6 @@ impl PageServerConf {
.join(TENANT_LOCATION_CONFIG_NAME)
}
pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TENANT_HEATMAP_BASENAME)
}
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TIMELINES_SEGMENT_NAME)
@@ -916,10 +878,6 @@ impl PageServerConf {
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
"secondary_download_concurrency" => {
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
},
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -991,8 +949,6 @@ impl PageServerConf {
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
}
}
}
@@ -1221,9 +1177,7 @@ background_task_maximum_delay = '334 s'
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
},
"Correct defaults should be used when no config values are provided"
);
@@ -1284,9 +1238,7 @@ background_task_maximum_delay = '334 s'
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -1,6 +1,5 @@
use std::collections::HashMap;
use futures::Future;
use pageserver_api::{
control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -29,14 +28,13 @@ pub enum RetryForeverError {
ShuttingDown,
}
#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
fn re_attach(
&self,
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
fn validate(
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
}
impl ControlPlaneClient {
@@ -125,6 +123,7 @@ impl ControlPlaneClient {
}
}
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {

View File

@@ -831,6 +831,7 @@ mod test {
}
}
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for MockControlPlane {
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {

View File

@@ -0,0 +1,86 @@
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
/// specified time (in milliseconds). The main difference is that we use async
/// tokio sleep function. Another difference is that we print lines to the log,
/// which can be useful in tests to check that the failpoint was hit.
#[macro_export]
macro_rules! __failpoint_sleep_millis_async {
($name:literal) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
}
}};
}
pub use __failpoint_sleep_millis_async as sleep_millis_async;
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
#[doc(hidden)]
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", name);
}
pub fn init() -> fail::FailScenario<'static> {
// The failpoints lib provides support for parsing the `FAILPOINTS` env var.
// We want non-default behavior for `exit`, though, so, we handle it separately.
//
// Format for FAILPOINTS is "name=actions" separated by ";".
let actions = std::env::var("FAILPOINTS");
if actions.is_ok() {
std::env::remove_var("FAILPOINTS");
} else {
// let the library handle non-utf8, or nothing for not present
}
let scenario = fail::FailScenario::setup();
if let Ok(val) = actions {
val.split(';')
.enumerate()
.map(|(i, s)| s.split_once('=').ok_or((i, s)))
.for_each(|res| {
let (name, actions) = match res {
Ok(t) => t,
Err((i, s)) => {
panic!(
"startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
i + 1,
);
}
};
if let Err(e) = apply_failpoint(name, actions) {
panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
}
});
}
scenario
}
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
if actions == "exit" {
fail::cfg_callback(name, exit_failpoint)
} else {
fail::cfg(name, actions)
}
}
#[inline(never)]
fn exit_failpoint() {
tracing::info!("Exit requested by failpoint");
std::process::exit(1);
}

View File

@@ -15,7 +15,6 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLoadRequest, TenantLocationConfigRequest,
@@ -26,7 +25,6 @@ use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -38,7 +36,6 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::UpsertLocationError;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
@@ -48,8 +45,7 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::SpawnMode;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
@@ -70,6 +66,9 @@ use utils::{
lsn::Lsn,
};
// Imports only used for testing APIs
use pageserver_api::models::ConfigureFailpointsRequest;
// For APIs that require an Active tenant, how long should we block waiting for that state?
// This is not functionally necessary (clients will retry), but avoids generating a lot of
// failed API calls while tenants are activating.
@@ -115,6 +114,14 @@ impl State {
secondary_controller,
})
}
fn tenant_resources(&self) -> TenantSharedResources {
TenantSharedResources {
broker_client: self.broker_client.clone(),
remote_storage: self.remote_storage.clone(),
deletion_queue_client: self.deletion_queue_client.clone(),
}
}
}
#[inline(always)]
@@ -147,7 +154,6 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
}
@@ -170,7 +176,7 @@ impl From<TenantSlotError> for ApiError {
NotFound(tenant_id) => {
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
}
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
InProgress => {
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
}
@@ -189,18 +195,6 @@ impl From<TenantSlotUpsertError> for ApiError {
}
}
impl From<UpsertLocationError> for ApiError {
fn from(e: UpsertLocationError) -> ApiError {
use UpsertLocationError::*;
match e {
BadRequest(e) => ApiError::BadRequest(e),
Unavailable(_) => ApiError::ShuttingDown,
e @ InProgress => ApiError::Conflict(format!("{e}")),
Flush(e) | Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<TenantMapError> for ApiError {
fn from(e: TenantMapError) -> ApiError {
use TenantMapError::*;
@@ -323,21 +317,11 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
async fn build_timeline_info(
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
force_await_initial_logical_size: bool,
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
if force_await_initial_logical_size {
timeline.clone().await_initial_logical_size().await
}
let mut info = build_timeline_info_common(
timeline,
ctx,
tenant::timeline::GetLogicalSizePriority::Background,
)
.await?;
let mut info = build_timeline_info_common(timeline, ctx).await?;
if include_non_incremental_logical_size {
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -354,7 +338,6 @@ async fn build_timeline_info(
async fn build_timeline_info_common(
timeline: &Arc<Timeline>,
ctx: &RequestContext,
logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
let initdb_lsn = timeline.initdb_lsn;
@@ -377,7 +360,8 @@ async fn build_timeline_info_common(
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
};
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
let current_logical_size =
timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
let remote_consistent_lsn_projected = timeline
@@ -488,7 +472,7 @@ async fn timeline_create_handler(
.await {
Ok(new_timeline) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
@@ -524,8 +508,6 @@ async fn timeline_list_handler(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
let force_await_initial_logical_size: Option<bool> =
parse_query_param(&request, "force-await-initial-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -539,7 +521,6 @@ async fn timeline_list_handler(
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
&ctx,
)
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -567,8 +548,6 @@ async fn timeline_detail_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
let force_await_initial_logical_size: Option<bool> =
parse_query_param(&request, "force-await-initial-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
// Logical size calculation needs downloading.
@@ -584,7 +563,6 @@ async fn timeline_detail_handler(
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
&ctx,
)
.await
@@ -703,37 +681,16 @@ async fn tenant_attach_handler(
)));
}
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let tenant = state
.tenant_manager
.upsert_location(
tenant_shard_id,
location_conf,
None,
SpawnMode::Normal,
&ctx,
)
.await?;
let Some(tenant) = tenant else {
// This should never happen: indicates a bug in upsert_location
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Upsert succeeded but didn't return tenant!"
)));
};
// We might have successfully constructed a Tenant, but it could still
// end up in a broken state:
if let TenantState::Broken {
reason,
backtrace: _,
} = tenant.current_state()
{
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Tenant state is Broken: {reason}"
)));
}
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -1192,25 +1149,16 @@ async fn tenant_create_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let new_tenant = state
.tenant_manager
.upsert_location(
target_tenant_id,
location_conf,
None,
SpawnMode::Create,
&ctx,
)
.await?;
let Some(new_tenant) = new_tenant else {
// This should never happen: indicates a bug in upsert_location
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Upsert succeeded but didn't return tenant!"
)));
};
let new_tenant = mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
generation,
state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
.await?;
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
@@ -1219,7 +1167,7 @@ async fn tenant_create_handler(
.await
{
// This shouldn't happen because we just created the tenant directory
// in upsert_location, and there aren't any remote timelines
// in tenant::mgr::create_tenant, and there aren't any remote timelines
// to load, so, nothing can really fail during load.
// Don't do cleanup because we don't know how we got here.
// The tenant will likely be in `Broken` state and subsequent
@@ -1320,31 +1268,12 @@ async fn put_tenant_location_config_handler(
state
.tenant_manager
.upsert_location(
tenant_shard_id,
location_conf,
flush,
tenant::SpawnMode::Normal,
&ctx,
)
.await?;
if let Some(_flush_ms) = flush {
match state
.secondary_controller
.upload_tenant(tenant_shard_id)
.await
{
Ok(()) => {
tracing::info!("Uploaded heatmap during flush");
}
Err(e) => {
tracing::warn!("Failed to flush heatmap: {e}");
}
}
} else {
tracing::info!("No flush requested when configuring");
}
.upsert_location(tenant_shard_id, location_conf, flush, &ctx)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
json_response(StatusCode::OK, ())
}
@@ -1364,6 +1293,34 @@ async fn handle_tenant_break(
json_response(StatusCode::OK, ())
}
async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
"Cannot manage failpoints because pageserver was compiled without failpoints support"
)));
}
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
for fp in failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow!(
"Failed to configure failpoints: {err_msg}"
)));
}
}
json_response(StatusCode::OK, ())
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
@@ -1683,21 +1640,6 @@ async fn secondary_upload_handler(
json_response(StatusCode::OK, ())
}
async fn secondary_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
state
.secondary_controller
.download_tenant(tenant_shard_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -1966,9 +1908,6 @@ pub fn make_router(
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
api_handler(r, secondary_download_handler)
})
.put("/v1/tenant/:tenant_shard_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})

View File

@@ -21,7 +21,6 @@ use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
@@ -313,16 +312,13 @@ async fn import_wal(
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
let mut modification = tline.begin_modification(last_lsn);
let mut modification = tline.begin_modification(endpoint);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
WAL_INGEST.records_committed.inc();
modification.commit(ctx).await?;
last_lsn = lsn;
nrecords += 1;
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(last_lsn);
let mut modification = tline.begin_modification(end_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
modification.commit(ctx).await?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);

View File

@@ -25,6 +25,8 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
pub mod failpoint_support;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
@@ -117,10 +119,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
/// Full path: `tenants/<tenant_id>/config`.
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
/// Per-tenant copy of their remote heatmap, downloaded into the local
/// tenant path while in secondary mode.
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
/// A suffix used for various temporary files. Any temporary files found in the
/// data directory at pageserver startup can be automatically removed.
pub const TEMP_FILE_SUFFIX: &str = "___temp";

View File

@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
// Metrics collected on operations on the storage repository.
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum StorageTimeOperation {
pub enum StorageTimeOperation {
#[strum(serialize = "layer flush")]
LayerFlush,
@@ -55,7 +55,7 @@ pub(crate) enum StorageTimeOperation {
CreateTenant,
}
pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
"pageserver_storage_operations_seconds_sum",
"Total time spent on storage operations with operation, tenant and timeline dimensions",
@@ -64,7 +64,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
.expect("failed to define a metric")
});
pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_storage_operations_seconds_count",
"Count of storage operations with operation, tenant and timeline dimensions",
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) struct PageCacheMetricsForTaskKind {
pub struct PageCacheMetricsForTaskKind {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_immutable: IntCounter,
@@ -159,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
pub read_hits_materialized_page_older_lsn: IntCounter,
}
pub(crate) struct PageCacheMetrics {
pub struct PageCacheMetrics {
map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
}
@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
let task_kind: &'static str = task_kind.into();
@@ -243,9 +243,10 @@ impl PageCacheMetrics {
}
}
pub(crate) struct PageCacheSizeMetrics {
pub struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
pub current_bytes_ephemeral: UIntGauge,
pub current_bytes_immutable: UIntGauge,
pub current_bytes_materialized_page: UIntGauge,
}
@@ -259,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
Lazy::new(|| PageCacheSizeMetrics {
max_bytes: {
register_uint_gauge!(
"pageserver_page_cache_size_max_bytes",
"Maximum size of the page cache in bytes"
)
.expect("failed to define a metric")
},
current_bytes_immutable: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
current_bytes_materialized_page: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
});
pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
max_bytes: {
register_uint_gauge!(
"pageserver_page_cache_size_max_bytes",
"Maximum size of the page cache in bytes"
)
.expect("failed to define a metric")
},
current_bytes_ephemeral: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
},
current_bytes_immutable: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
current_bytes_materialized_page: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
});
pub(crate) mod page_cache_eviction_metrics {
use std::num::NonZeroUsize;
@@ -734,13 +740,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub(crate) struct EvictionsWithLowResidenceDuration {
pub struct EvictionsWithLowResidenceDuration {
data_source: &'static str,
threshold: Duration,
counter: Option<IntCounter>,
}
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
pub struct EvictionsWithLowResidenceDurationBuilder {
data_source: &'static str,
threshold: Duration,
}
@@ -1003,7 +1009,7 @@ pub enum SmgrQueryType {
}
#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
pub struct SmgrQueryTimePerTimeline {
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
}
@@ -1175,8 +1181,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
.map(|ms| (ms as f64) / 1000.0)
});
pub(crate) struct BasebackupQueryTime(HistogramVec);
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
pub struct BasebackupQueryTime(HistogramVec);
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
BasebackupQueryTime({
register_histogram_vec!(
"pageserver_basebackup_query_seconds",
@@ -1196,7 +1202,7 @@ impl DurationResultObserver for BasebackupQueryTime {
}
}
pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
@@ -1363,8 +1369,6 @@ pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
pub(crate) upload_heatmap_duration: Histogram,
pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
@@ -1382,16 +1386,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations"
)
.expect("failed to define a metric"),
download_layer: register_int_counter!(
"pageserver_secondary_download_layer",
"Number of downloads of layers by secondary mode locations"
)
.expect("failed to define a metric"),
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1661,7 +1655,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer {
pub struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
start: Instant,
}
@@ -1686,7 +1680,7 @@ impl StorageTimeMetricsTimer {
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
/// timeline total sum and count.
#[derive(Clone, Debug)]
pub(crate) struct StorageTimeMetrics {
pub struct StorageTimeMetrics {
/// Sum of f64 seconds, per operation, tenant_id and timeline_id
timeline_sum: Counter,
/// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1725,7 +1719,7 @@ impl StorageTimeMetrics {
}
#[derive(Debug)]
pub(crate) struct TimelineMetrics {
pub struct TimelineMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
@@ -1933,7 +1927,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
}
}
pub(crate) struct RemoteTimelineClientMetrics {
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
@@ -2231,7 +2225,7 @@ impl Drop for RemoteTimelineClientMetrics {
/// Wrapper future that measures the time spent by a remote storage operation,
/// and records the time and success/failure as a prometheus metric.
pub(crate) trait MeasureRemoteOp: Sized {
pub trait MeasureRemoteOp: Sized {
fn measure_remote_op(
self,
tenant_id: TenantId,
@@ -2256,7 +2250,7 @@ pub(crate) trait MeasureRemoteOp: Sized {
impl<T: Sized> MeasureRemoteOp for T {}
pin_project! {
pub(crate) struct MeasuredRemoteOp<F>
pub struct MeasuredRemoteOp<F>
{
#[pin]
inner: F,

View File

@@ -25,7 +25,6 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::borrow::Cow;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
@@ -54,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
use crate::pgdatadir_mapping::rel_block_to_key;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -62,9 +61,6 @@ use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::trace::Tracer;
@@ -287,64 +283,6 @@ struct PageServerHandler {
connection_ctx: RequestContext,
}
#[derive(thiserror::Error, Debug)]
enum PageStreamError {
/// We encountered an error that should prompt the client to reconnect:
/// in practice this means we drop the connection without sending a response.
#[error("Reconnect required: {0}")]
Reconnect(Cow<'static, str>),
/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,
/// Something went wrong reading a page: this likely indicates a pageserver bug
#[error("Read error: {0}")]
Read(PageReconstructError),
/// Ran out of time waiting for an LSN
#[error("LSN timeout: {0}")]
LsnTimeout(WaitLsnError),
/// The entity required to serve the request (tenant or timeline) is not found,
/// or is not found in a suitable state to serve a request.
#[error("Not found: {0}")]
NotFound(std::borrow::Cow<'static, str>),
/// Request asked for something that doesn't make sense, like an invalid LSN
#[error("Bad request: {0}")]
BadRequest(std::borrow::Cow<'static, str>),
}
impl From<PageReconstructError> for PageStreamError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => Self::Shutdown,
e => Self::Read(e),
}
}
}
impl From<GetActiveTimelineError> for PageStreamError {
fn from(value: GetActiveTimelineError) -> Self {
match value {
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
}
}
}
impl From<WaitLsnError> for PageStreamError {
fn from(value: WaitLsnError) -> Self {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
}
}
}
impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
@@ -490,7 +428,7 @@ impl PageServerHandler {
// Check that the timeline exists
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
.map_err(|e| anyhow::anyhow!(e))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
@@ -582,44 +520,32 @@ impl PageServerHandler {
}
};
match response {
Err(PageStreamError::Shutdown) => {
if let Err(e) = &response {
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropping connection due to shutdown"));
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
return Err(QueryError::Shutdown);
}
Err(PageStreamError::Reconnect(reason)) => {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
// This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
//
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
return Err(QueryError::Shutdown);
}
r => {
let response_msg = r.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
}
}
let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
}
Ok(())
}
@@ -766,7 +692,7 @@ impl PageServerHandler {
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
ctx: &RequestContext,
) -> Result<Lsn, PageStreamError> {
) -> anyhow::Result<Lsn> {
if latest {
// Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the
@@ -797,19 +723,15 @@ impl PageServerHandler {
}
} else {
if lsn == Lsn(0) {
return Err(PageStreamError::BadRequest(
"invalid LSN(0) in request".into(),
));
anyhow::bail!("invalid LSN(0) in request");
}
timeline.wait_lsn(lsn, ctx).await?;
}
if lsn < **latest_gc_cutoff_lsn {
return Err(PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
).into()));
}
anyhow::ensure!(
lsn >= **latest_gc_cutoff_lsn,
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
);
Ok(lsn)
}
@@ -818,14 +740,14 @@ impl PageServerHandler {
timeline: &Timeline,
req: &PagestreamExistsRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let exists = timeline
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
.get_rel_exists(req.rel, lsn, req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -838,15 +760,13 @@ impl PageServerHandler {
timeline: &Timeline,
req: &PagestreamNblocksRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let n_blocks = timeline
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
@@ -858,20 +778,14 @@ impl PageServerHandler {
timeline: &Timeline,
req: &PagestreamDbSizeRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let total_blocks = timeline
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::Lsn(lsn),
req.latest,
ctx,
)
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -880,35 +794,30 @@ impl PageServerHandler {
}))
}
async fn do_handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
}))
}
async fn handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
/*
// Add a 1s delay to some requests. The delay helps the requests to
// hit the race condition from github issue #1047 more easily.
use rand::Rng;
if rand::thread_rng().gen::<u8>() < 5 {
std::thread::sleep(std::time::Duration::from_millis(1000));
}
*/
let key = rel_block_to_key(req.rel, req.blkno);
if timeline.get_shard_identity().is_key_local(&key) {
self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
.await
let page = if timeline.get_shard_identity().is_key_local(&key) {
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
} else {
// The Tenant shard we looked up at connection start does not hold this particular
// key: look for other shards in this tenant. This scenario occurs if a pageserver
@@ -927,30 +836,30 @@ impl PageServerHandler {
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return Err(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
));
// the requested page is not present on this node.
// TODO: this should be some kind of structured error that the client will understand,
// so that it can block until its config is updated: this error is expected in the case
// that the Tenant's shards' placements are being updated and the client hasn't been
// informed yet.
//
// https://github.com/neondatabase/neon/issues/6038
return Err(anyhow::anyhow!("Request routed to wrong shard"));
}
Err(e) => return Err(e.into()),
};
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline
.gate
.enter()
.map_err(|_| PageStreamError::Shutdown)?;
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
};
self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
.await
}
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
}))
}
#[allow(clippy::too_many_arguments)]
@@ -1091,7 +1000,9 @@ impl PageServerHandler {
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant.get_timeline(timeline_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}
}
@@ -1500,8 +1411,7 @@ impl From<GetActiveTenantError> for QueryError {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
),
GetActiveTenantError::Cancelled
| GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
QueryError::Shutdown
}
e => QueryError::Other(anyhow::anyhow!(e)),
@@ -1514,15 +1424,14 @@ enum GetActiveTimelineError {
#[error(transparent)]
Tenant(GetActiveTenantError),
#[error(transparent)]
Timeline(#[from] GetTimelineError),
Timeline(anyhow::Error),
}
impl From<GetActiveTimelineError> for QueryError {
fn from(e: GetActiveTimelineError) -> Self {
match e {
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
GetActiveTimelineError::Tenant(e) => e.into(),
GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
}
}
}

View File

@@ -11,7 +11,7 @@ use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,7 +147,6 @@ impl Timeline {
{
DatadirModification {
tline: self,
pending_lsns: Vec::new(),
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
@@ -160,11 +159,11 @@ impl Timeline {
//------------------------------------------------------------------------------
/// Look up given page version.
pub(crate) async fn get_rel_page_at_lsn(
pub async fn get_rel_page_at_lsn(
&self,
tag: RelTag,
blknum: BlockNumber,
version: Version<'_>,
lsn: Lsn,
latest: bool,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
@@ -174,47 +173,44 @@ impl Timeline {
));
}
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag,
blknum,
version.get_lsn(),
nblocks
tag, blknum, lsn, nblocks
);
return Ok(ZERO_PAGE.clone());
}
let key = rel_block_to_key(tag, blknum);
version.get(self, key, ctx).await
self.get(key, lsn, ctx).await
}
// Get size of a database in blocks
pub(crate) async fn get_db_size(
pub async fn get_db_size(
&self,
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
lsn: Lsn,
latest: bool,
ctx: &RequestContext,
) -> Result<usize, PageReconstructError> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
for rel in rels {
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
}
/// Get size of a relation file
pub(crate) async fn get_rel_size(
pub async fn get_rel_size(
&self,
tag: RelTag,
version: Version<'_>,
lsn: Lsn,
latest: bool,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
@@ -224,12 +220,12 @@ impl Timeline {
));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, version, latest, ctx).await?
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
@@ -239,7 +235,7 @@ impl Timeline {
}
let key = rel_size_to_key(tag);
let mut buf = version.get(self, key, ctx).await?;
let mut buf = self.get(key, lsn, ctx).await?;
let nblocks = buf.get_u32_le();
if latest {
@@ -250,16 +246,16 @@ impl Timeline {
// latest=true, then it can not cause cache corruption, because with latest=true
// pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
// associated with most recent value of LSN.
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
self.update_cached_rel_size(tag, lsn, nblocks);
}
Ok(nblocks)
}
/// Does relation exist?
pub(crate) async fn get_rel_exists(
pub async fn get_rel_exists(
&self,
tag: RelTag,
version: Version<'_>,
lsn: Lsn,
_latest: bool,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
@@ -270,12 +266,12 @@ impl Timeline {
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -291,16 +287,16 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn list_rels(
pub async fn list_rels(
&self,
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashSet<RelTag>, PageReconstructError> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -319,7 +315,7 @@ impl Timeline {
}
/// Look up given SLRU page version.
pub(crate) async fn get_slru_page_at_lsn(
pub async fn get_slru_page_at_lsn(
&self,
kind: SlruKind,
segno: u32,
@@ -332,29 +328,29 @@ impl Timeline {
}
/// Get size of an SLRU segment
pub(crate) async fn get_slru_segment_size(
pub async fn get_slru_segment_size(
&self,
kind: SlruKind,
segno: u32,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = version.get(self, key, ctx).await?;
let mut buf = self.get(key, lsn, ctx).await?;
Ok(buf.get_u32_le())
}
/// Get size of an SLRU segment
pub(crate) async fn get_slru_segment_exists(
pub async fn get_slru_segment_exists(
&self,
kind: SlruKind,
segno: u32,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -372,7 +368,7 @@ impl Timeline {
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
pub(crate) async fn find_lsn_for_timestamp(
pub async fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
cancel: &CancellationToken,
@@ -452,7 +448,7 @@ impl Timeline {
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
/// with a smaller/larger timestamp.
///
pub(crate) async fn is_latest_commit_timestamp_ge_than(
pub async fn is_latest_commit_timestamp_ge_than(
&self,
search_timestamp: TimestampTz,
probe_lsn: Lsn,
@@ -475,7 +471,7 @@ impl Timeline {
/// Obtain the possible timestamp range for the given lsn.
///
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
pub(crate) async fn get_timestamp_for_lsn(
pub async fn get_timestamp_for_lsn(
&self,
probe_lsn: Lsn,
ctx: &RequestContext,
@@ -505,11 +501,11 @@ impl Timeline {
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
) -> Result<T, PageReconstructError> {
for segno in self
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
.await?
{
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
.await?;
for blknum in (0..nblocks).rev() {
let clog_page = self
@@ -532,36 +528,36 @@ impl Timeline {
}
/// Get a list of SLRU segments
pub(crate) async fn list_slru_segments(
pub async fn list_slru_segments(
&self,
kind: SlruKind,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashSet<u32>, PageReconstructError> {
// fetch directory entry
let key = slru_dir_to_key(kind);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.segments),
Err(e) => Err(PageReconstructError::from(e)),
}
}
pub(crate) async fn get_relmap_file(
pub async fn get_relmap_file(
&self,
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
let key = relmap_file_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
Ok(buf)
}
pub(crate) async fn list_dbdirs(
pub async fn list_dbdirs(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -575,7 +571,7 @@ impl Timeline {
}
}
pub(crate) async fn get_twophase_file(
pub async fn get_twophase_file(
&self,
xid: TransactionId,
lsn: Lsn,
@@ -586,7 +582,7 @@ impl Timeline {
Ok(buf)
}
pub(crate) async fn list_twophase_files(
pub async fn list_twophase_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -600,7 +596,7 @@ impl Timeline {
}
}
pub(crate) async fn get_control_file(
pub async fn get_control_file(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -608,7 +604,7 @@ impl Timeline {
self.get(CONTROLFILE_KEY, lsn, ctx).await
}
pub(crate) async fn get_checkpoint(
pub async fn get_checkpoint(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -616,7 +612,7 @@ impl Timeline {
self.get(CHECKPOINT_KEY, lsn, ctx).await
}
pub(crate) async fn list_aux_files(
pub async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -656,10 +652,7 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
.await?
{
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
if self.cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
@@ -699,7 +692,7 @@ impl Timeline {
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
.list_rels(spcnode, dbnode, lsn, ctx)
.await?
.into_iter()
.collect();
@@ -806,39 +799,18 @@ pub struct DatadirModification<'a> {
/// in the state in 'tline' yet.
pub tline: &'a Timeline,
/// Current LSN of the modification
lsn: Lsn,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_updates: HashMap<Key, Value>,
pending_deletions: Vec<Range<Key>>,
pending_nblocks: i64,
}
impl<'a> DatadirModification<'a> {
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
lsn >= self.lsn,
"setting an older lsn {} than {} is not allowed",
lsn,
self.lsn
);
if lsn > self.lsn {
self.pending_lsns.push(self.lsn);
self.lsn = lsn;
}
Ok(())
}
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -1012,9 +984,11 @@ impl<'a> DatadirModification<'a> {
dbnode: Oid,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let req_lsn = self.tline.get_last_record_lsn();
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
.await?;
// Remove entry from dbdir
@@ -1103,11 +1077,8 @@ impl<'a> DatadirModification<'a> {
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), true, ctx)
.await?
{
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1352,23 +1323,17 @@ impl<'a> DatadirModification<'a> {
let writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;
} else {
retained_pending_updates
.entry(key)
.or_default()
.push((lsn, value));
}
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
} else {
retained_pending_updates.insert(key, value);
}
}
self.pending_updates = retained_pending_updates;
self.pending_updates.extend(retained_pending_updates);
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1385,28 +1350,18 @@ impl<'a> DatadirModification<'a> {
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
writer.put_batch(&self.pending_updates, ctx).await?;
self.pending_updates.clear();
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value, ctx).await?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;
}
if !self.pending_deletions.is_empty() {
writer.delete_batch(&self.pending_deletions).await?;
self.pending_deletions.clear();
}
self.pending_lsns.push(self.lsn);
for pending_lsn in self.pending_lsns.drain(..) {
// Ideally, we should be able to call writer.finish_write() only once
// with the highest LSN. However, the last_record_lsn variable in the
// timeline keeps track of the latest LSN and the immediate previous LSN
// so we need to record every LSN to not leave a gap between them.
writer.finish_write(pending_lsn);
}
writer.finish_write(lsn);
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1415,86 +1370,44 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub(crate) fn len(&self) -> usize {
self.pending_updates.len() + self.pending_deletions.len()
pub(crate) fn is_empty(&self) -> bool {
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
}
// Internal helper functions to batch the modifications
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
// Have we already updated the same key? Read the latest pending updated
// Have we already updated the same key? Read the pending updated
// version in that case.
//
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
};
if let Some(value) = self.pending_updates.get(&key) {
if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
*last_value = val;
return;
}
}
values.push((self.lsn, val));
self.pending_updates.insert(key, val);
}
fn delete(&mut self, key_range: Range<Key>) {
trace!("DELETE {}-{}", key_range.start, key_range.end);
self.pending_deletions.push((key_range, self.lsn));
}
}
/// This struct facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
/// need to look up the keys in the modification first before looking them up in the
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
Lsn(Lsn),
Modified(&'a DatadirModification<'a>),
}
impl<'a> Version<'a> {
async fn get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}
fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
self.pending_deletions.push(key_range);
}
}

View File

@@ -23,7 +23,7 @@ impl Statvfs {
}
// NB: allow() because the block count type is u32 on macOS.
#[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
#[allow(clippy::useless_conversion)]
pub fn blocks(&self) -> u64 {
match self {
Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
@@ -32,7 +32,7 @@ impl Statvfs {
}
// NB: allow() because the block count type is u32 on macOS.
#[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
#[allow(clippy::useless_conversion)]
pub fn blocks_available(&self) -> u64 {
match self {
Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),

View File

@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
// else, but that has not been needed in a long time.
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
});
#[derive(Debug, Clone, Copy)]
@@ -258,9 +258,6 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryDownloads,
/// See [`crate::tenant::secondary`].
SecondaryUploads,

View File

@@ -12,7 +12,7 @@
//!
use anyhow::{bail, Context};
use camino::Utf8Path;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
@@ -33,7 +33,6 @@ use tracing::*;
use utils::backoff;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::failpoint_support;
use utils::fs_ext;
use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
@@ -56,7 +55,6 @@ use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use self::timeline::WaitLsnError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
@@ -130,13 +128,6 @@ macro_rules! pausable_failpoint {
.expect("spawn_blocking");
}
};
($name:literal, $cond:expr) => {
if cfg!(feature = "testing") {
if $cond {
pausable_failpoint!($name)
}
}
};
}
pub mod blob_io;
@@ -603,9 +594,10 @@ impl Tenant {
mode: SpawnMode,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO(sharding): make WalRedoManager shard-aware
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id,
tenant_shard_id.tenant_id,
)));
let TenantSharedResources {
@@ -898,7 +890,7 @@ impl Tenant {
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
failpoint_support::sleep_millis_async!("before-attaching-tenant");
crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
let preload = match preload {
Some(p) => p,
@@ -1010,7 +1002,7 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
info!("Done");
@@ -1152,9 +1144,10 @@ impl Tenant {
tenant_shard_id: TenantShardId,
reason: String,
) -> Arc<Tenant> {
// TODO(sharding): make WalRedoManager shard-aware
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id,
tenant_shard_id.tenant_id,
)));
Arc::new(Tenant::new(
TenantState::Broken {
@@ -1766,15 +1759,7 @@ impl Tenant {
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline
.wait_lsn(*lsn, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
})?;
ancestor_timeline.wait_lsn(*lsn, ctx).await?;
}
self.branch_timeline(
@@ -2043,13 +2028,6 @@ impl Tenant {
// It's mesed up.
// we just ignore the failure to stop
// If we're still attaching, fire the cancellation token early to drop out: this
// will prevent us flushing, but ensures timely shutdown if some I/O during attach
// is very slow.
if matches!(self.current_state(), TenantState::Attaching) {
self.cancel.cancel();
}
match self.set_stopping(shutdown_progress, false, false).await {
Ok(()) => {}
Err(SetStoppingError::Broken) => {
@@ -2748,10 +2726,6 @@ impl Tenant {
"#
.to_string();
fail::fail_point!("tenant-config-before-write", |_| {
anyhow::bail!("tenant-config-before-write");
});
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
@@ -2865,7 +2839,9 @@ impl Tenant {
}
};
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
crate::failpoint_support::sleep_millis_async!(
"gc_iteration_internal_after_getting_gc_timelines"
);
// If there is nothing to GC, we don't want any messages in the INFO log.
if !gc_timelines.is_empty() {
@@ -3668,6 +3644,140 @@ fn remove_timeline_and_uninit_mark(
Ok(())
}
pub(crate) async fn create_tenant_files(
conf: &'static PageServerConf,
location_conf: &LocationConf,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<Utf8PathBuf> {
let target_tenant_directory = conf.tenant_path(tenant_shard_id);
anyhow::ensure!(
!target_tenant_directory
.try_exists()
.context("check existence of tenant directory")?,
"tenant directory already exists",
);
let temporary_tenant_dir =
path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
debug!("Creating temporary directory structure in {temporary_tenant_dir}");
// top-level dir may exist if we are creating it through CLI
crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
format!("could not create temporary tenant directory {temporary_tenant_dir}")
})?;
let creation_result = try_create_target_tenant_dir(
conf,
location_conf,
tenant_shard_id,
&temporary_tenant_dir,
&target_tenant_directory,
)
.await;
if creation_result.is_err() {
error!(
"Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data"
);
if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
} else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
error!(
"Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
)
}
}
creation_result?;
Ok(target_tenant_directory)
}
async fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
location_conf: &LocationConf,
tenant_shard_id: &TenantShardId,
temporary_tenant_dir: &Utf8Path,
target_tenant_directory: &Utf8Path,
) -> Result<(), anyhow::Error> {
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(tenant_shard_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?;
let temporary_legacy_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_shard_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_location_config_path(tenant_shard_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
Tenant::persist_tenant_config_at(
tenant_shard_id,
&temporary_tenant_config_path,
&temporary_legacy_tenant_config_path,
location_conf,
)
.await?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
"create tenant {} temporary timelines directory {}",
tenant_shard_id, temporary_tenant_timelines_dir,
)
})?;
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
});
// Make sure the current tenant directory entries are durable before renaming.
// Without this, a crash may reorder any of the directory entry creations above.
crashsafe::fsync(temporary_tenant_dir)
.with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
format!(
"move tenant {} temporary directory {} into the permanent one {}",
tenant_shard_id, temporary_tenant_dir, target_tenant_directory
)
})?;
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
format!(
"get tenant {} dir parent for {}",
tenant_shard_id, target_tenant_directory,
)
})?;
crashsafe::fsync(target_dir_parent).with_context(|| {
format!(
"fsync renamed directory's parent {} for tenant {}",
target_dir_parent, tenant_shard_id,
)
})?;
Ok(())
}
fn rebase_directory(
original_path: &Utf8Path,
base: &Utf8Path,
new_base: &Utf8Path,
) -> anyhow::Result<Utf8PathBuf> {
let relative_path = original_path.strip_prefix(base).with_context(|| {
format!(
"Failed to strip base prefix '{}' off path '{}'",
base, original_path
)
})?;
Ok(new_base.join(relative_path))
}
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
/// to get bootstrap data for timeline initialization.
async fn run_initdb(
@@ -3762,7 +3872,6 @@ pub async fn dump_layerfile_from_path(
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::shard::ShardIndex;
use std::fs;
@@ -3830,6 +3939,8 @@ pub(crate) mod harness {
pub struct TenantHarness {
pub conf: &'static PageServerConf,
pub tenant_conf: TenantConf,
// TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id
pub(crate) tenant_id: TenantId,
pub tenant_shard_id: TenantShardId,
pub generation: Generation,
pub shard: ShardIndex,
@@ -3891,6 +4002,7 @@ pub(crate) mod harness {
Ok(Self {
conf,
tenant_conf,
tenant_id,
tenant_shard_id,
generation: Generation::new(0xdeadbeef),
shard: ShardIndex::unsharded(),

View File

@@ -46,8 +46,6 @@ pub mod defaults {
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]

View File

@@ -588,7 +588,7 @@ impl DeleteTenantFlow {
}
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
// This is unexpected: this secondary tenants should not have been created, and we
// are not in a position to shut it down from here.
tracing::warn!("Tenant transitioned to secondary mode while deleting!");

View File

@@ -35,7 +35,7 @@ use crate::tenant::config::{
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -44,7 +44,6 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
@@ -58,7 +57,7 @@ use super::TenantSharedResources;
/// having a properly acquired generation (Secondary doesn't need a generation)
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
Secondary,
/// In this state, other administrative operations acting on the TenantId should
/// block, or return a retry indicator equivalent to HTTP 503.
InProgress(utils::completion::Barrier),
@@ -68,7 +67,7 @@ impl std::fmt::Debug for TenantSlot {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
Self::Secondary(_) => write!(f, "Secondary"),
Self::Secondary => write!(f, "Secondary"),
Self::InProgress(_) => write!(f, "InProgress"),
}
}
@@ -79,7 +78,7 @@ impl TenantSlot {
fn get_attached(&self) -> Option<&Arc<Tenant>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary(_) => None,
Self::Secondary => None,
Self::InProgress(_) => None,
}
}
@@ -131,7 +130,7 @@ impl TenantsMap {
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
fn resolve_attached_shard(
fn resolve_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
@@ -141,27 +140,25 @@ impl TenantsMap {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
_ => continue,
};
match selector {
ShardSelector::First => return Some(*slot.0),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return Some(*slot.0)
}
ShardSelector::Page(key) => {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if let Some(tenant) = slot.1.get_attached() {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
} else {
continue;
}
}
_ => continue,
@@ -467,18 +464,12 @@ pub async fn init_tenant_mgr(
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(secondary_config) => {
LocationMode::Secondary(_) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
secondary_config,
)),
);
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
@@ -670,14 +661,8 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
total_attached += 1;
}
TenantSlot::Secondary(state) => {
// We don't need to wait for this individually per-tenant: the
// downloader task will be waited on eventually, this cancel
// is just to encourage it to drop out if it is doing work
// for this tenant right now.
state.cancel.cancel();
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
TenantSlot::Secondary => {
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
}
TenantSlot::InProgress(notify) => {
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -754,6 +739,45 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
// caller will log how long we took
}
pub(crate) async fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_shard_id: TenantShardId,
generation: Generation,
resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
let location_conf = LocationConf::attached_single(tenant_conf, generation);
info!("Creating tenant at location {location_conf:?}");
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
let shard_identity = location_conf.shard;
let created_tenant = tenant_spawn(
conf,
tenant_shard_id,
&tenant_path,
resources,
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
None,
&TENANTS,
SpawnMode::Create,
ctx,
)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
let created_tenant_id = created_tenant.tenant_id();
debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id);
slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?;
Ok(created_tenant)
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum SetNewTenantConfigError {
#[error(transparent)]
@@ -785,24 +809,6 @@ pub(crate) async fn set_new_tenant_config(
Ok(())
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum UpsertLocationError {
#[error("Bad config request: {0}")]
BadRequest(anyhow::Error),
#[error("Cannot change config in this state: {0}")]
Unavailable(#[from] TenantMapError),
#[error("Tenant is already being modified")]
InProgress,
#[error("Failed to flush: {0}")]
Flush(anyhow::Error),
#[error("Internal error: {0}")]
Other(#[from] anyhow::Error),
}
impl TenantManager {
/// Convenience function so that anyone with a TenantManager can get at the global configuration, without
/// having to pass it around everywhere as a separate object.
@@ -839,49 +845,27 @@ impl TenantManager {
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary(_)) => {
None | Some(TenantSlot::Secondary) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
}
pub(crate) fn get_secondary_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
) -> Option<Arc<SecondaryTenant>> {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.ok()
.flatten();
match peek_slot {
Some(TenantSlot::Secondary(s)) => Some(s.clone()),
_ => None,
}
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,
tenant_shard_id: TenantShardId,
new_location_config: LocationConf,
flush: Option<Duration>,
spawn_mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
) -> Result<(), anyhow::Error> {
debug_assert_current_span_has_tenant_id();
info!("configuring tenant location to state {new_location_config:?}");
enum FastPathModified {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
}
// Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existng tenant.
let fast_path_taken = {
let modify_tenant = {
let locked = self.tenants.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -891,24 +875,16 @@ impl TenantManager {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
.map_err(UpsertLocationError::BadRequest)?,
);
tenant.set_new_location_config(AttachedTenantConf::try_from(
new_location_config.clone(),
)?);
Some(FastPathModified::Attached(tenant.clone()))
Some(tenant.clone())
} else {
// Different generations, fall through to general case
None
}
}
(
LocationMode::Secondary(secondary_conf),
Some(TenantSlot::Secondary(secondary_tenant)),
) => {
secondary_tenant.set_config(secondary_conf);
Some(FastPathModified::Secondary(secondary_tenant.clone()))
}
_ => {
// Not an Attached->Attached transition, fall through to general case
None
@@ -917,107 +893,69 @@ impl TenantManager {
};
// Fast-path continued: having dropped out of the self.tenants lock, do the async
// phase of writing config and/or waiting for flush, before returning.
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await?;
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(UpsertLocationError::Flush(e));
}
Ok(Ok(_)) => return Ok(Some(tenant)),
Err(_) => {
tracing::warn!(
// phase of waiting for flush, before returning.
if let Some(tenant) = modify_tenant {
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(e);
}
Ok(Ok(_)) => return Ok(()),
Err(_) => {
tracing::warn!(
timeout_ms = flush_timeout.as_millis(),
"Timed out waiting for flush to remote storage, proceeding anyway."
)
}
}
}
}
}
return Ok(Some(tenant));
}
Some(FastPathModified::Secondary(_secondary_tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await?;
return Ok(None);
}
None => {
// Proceed with the general case procedure, where we will shutdown & remove any existing
// slot contents and replace with a fresh one
}
};
return Ok(());
}
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
// InProgress value to the slot while we make whatever changes are required. The state for
// the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
// the state is ill-defined while we're in transition. Transitions are async, but fast: we do
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
.map_err(|e| match e {
TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
unreachable!("Called with mode Any")
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
TenantSlotError::InProgress => UpsertLocationError::InProgress,
TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s),
})?;
match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
slot_guard.drop_old_value().expect("We just shut it down");
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
state.shutdown().await;
}
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
// if the contents of a slot were InProgress.
return Err(UpsertLocationError::Other(anyhow::anyhow!(
"Acquired an InProgress slot, this is a bug."
)));
}
None => {
// Slot was vacant, nothing needs shutting down.
}
slot_guard.drop_old_value().expect("We just shut it down");
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
@@ -1035,12 +973,12 @@ impl TenantManager {
// Before activating either secondary or attached mode, persist the
// configuration, so that on restart we will re-attach (or re-start
// secondary) on the tenant.
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(secondary_config) => {
TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
}
LocationMode::Secondary(_) => TenantSlot::Secondary,
LocationMode::Attached(_attach_config) => {
let shard_identity = new_location_config.shard;
let tenant = tenant_spawn(
@@ -1052,7 +990,7 @@ impl TenantManager {
shard_identity,
None,
self.tenants,
spawn_mode,
SpawnMode::Normal,
ctx,
)?;
@@ -1060,20 +998,9 @@ impl TenantManager {
}
};
let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot {
Some(tenant.clone())
} else {
None
};
slot_guard.upsert(new_slot)?;
slot_guard.upsert(new_slot).map_err(|e| match e {
TenantSlotUpsertError::InternalError(e) => {
UpsertLocationError::Other(anyhow::anyhow!(e))
}
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
})?;
Ok(attached_tenant)
Ok(())
}
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1164,30 +1091,6 @@ impl TenantManager {
.collect(),
}
}
// Do some synchronous work for all tenant slots in Secondary state. The provided
// callback should be small and fast, as it will be called inside the global
// TenantsMap lock.
pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
where
// TODO: let the callback return a hint to drop out of the loop early
F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
{
let locked = self.tenants.read().unwrap();
let map = match &*locked {
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
TenantsMap::Open(m) => m,
};
for (tenant_id, slot) in map {
if let TenantSlot::Secondary(state) = slot {
// Only expose secondary tenants that are not currently shutting down
if !state.cancel.is_cancelled() {
func(tenant_id, state)
}
}
}
}
pub(crate) async fn delete_tenant(
&self,
@@ -1302,7 +1205,7 @@ pub(crate) fn get_tenant(
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary(_)) => {
None | Some(TenantSlot::Secondary) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
@@ -1354,11 +1257,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
let locked = TENANTS.read().unwrap();
// Resolve TenantId to TenantShardId
let tenant_shard_id = locked
.resolve_attached_shard(&tenant_id, shard_selector)
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)))?;
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
)?;
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
@@ -1375,7 +1276,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
}
}
}
Some(TenantSlot::Secondary(_)) => {
Some(TenantSlot::Secondary) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
)))
@@ -1639,12 +1540,61 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Secondary(_) => None,
TenantSlot::Secondary => None,
TenantSlot::InProgress(_) => None,
})
.collect())
}
/// Execute Attach mgmt API command.
///
/// Downloading all the tenant data is performed in the background, this merely
/// spawns the background task and returns quickly.
pub(crate) async fn attach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
tenant_conf: TenantConfOpt,
resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let shard_identity = location_conf.shard;
let attached_tenant = tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir,
resources,
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
None,
&TENANTS,
SpawnMode::Normal,
ctx,
)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
let attached_tenant_id = attached_tenant.tenant_id();
if tenant_id != attached_tenant_id {
return Err(TenantMapInsertError::Other(anyhow::anyhow!(
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
)));
}
slot_guard.upsert(TenantSlot::Attached(attached_tenant))?;
Ok(())
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantMapInsertError {
#[error(transparent)]
@@ -1658,7 +1608,7 @@ pub(crate) enum TenantMapInsertError {
/// Superset of TenantMapError: issues that can occur when acquiring a slot
/// for a particular tenant ID.
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantSlotError {
pub enum TenantSlotError {
/// When acquiring a slot with the expectation that the tenant already exists.
#[error("Tenant {0} not found")]
NotFound(TenantShardId),
@@ -1667,6 +1617,9 @@ pub(crate) enum TenantSlotError {
#[error("tenant {0} already exists, state: {1:?}")]
AlreadyExists(TenantShardId, TenantState),
#[error("tenant {0} already exists in but is not attached")]
Conflict(TenantShardId),
// Tried to read a slot that is currently being mutated by another administrative
// operation.
#[error("tenant has a state change in progress, try again later")]
@@ -1844,7 +1797,11 @@ impl SlotGuard {
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above
true
}
Some(TenantSlot::InProgress(_)) => {
// A SlotGuard cannot be constructed for a slot that was already InProgress
unreachable!()
@@ -2054,19 +2011,26 @@ where
let mut slot_guard =
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();
// The SlotGuard allows us to manipulate the Tenant object without fear of some
// concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
Some(TenantSlot::Attached(t)) => Some(t),
_ => None,
};
// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();
// If the tenant was attached, shut it down gracefully. For secondary
// locations this part is not necessary
match &attached_tenant {
Some(attached_tenant) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, freeze_and_flush).await {
match attached_tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2075,19 +2039,11 @@ where
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
}
}
Some(tenant)
}
Some(TenantSlot::Secondary(secondary_state)) => {
tracing::info!("Shutting down in secondary mode");
secondary_state.shutdown().await;
None
None => {
// Nothing to wait on when not attached, proceed.
}
Some(TenantSlot::InProgress(_)) => {
// Acquiring a slot guarantees its old value was not InProgress
unreachable!();
}
None => None,
};
}
match tenant_cleanup
.await

View File

@@ -229,7 +229,6 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
TENANT_HEATMAP_BASENAME,
};
use utils::id::{TenantId, TimelineId};
@@ -819,25 +818,8 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
) {
// Filter out any layers which were not created by this tenant shard. These are
// layers that originate from some ancestor shard after a split, and may still
// be referenced by other shards. We are free to delete them locally and remove
// them from our index (and would have already done so when we reach this point
// in the code), but we may not delete them remotely.
with_metadata.retain(|(name, meta)| {
let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
&& meta.shard.shard_count == self.tenant_shard_id.shard_count;
if !retain {
tracing::debug!(
"Skipping deletion of ancestor-shard layer {name}, from shard {}",
meta.shard
);
}
retain
});
for (name, meta) in &with_metadata {
info!(
"scheduling deletion of layer {}{} (shard {})",
@@ -1742,11 +1724,11 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
))
.expect("Failed to construct path")
RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
@@ -1903,7 +1885,7 @@ mod tests {
fn span(&self) -> tracing::Span {
tracing::info_span!(
"test",
tenant_id = %self.harness.tenant_shard_id.tenant_id,
tenant_id = %self.harness.tenant_id,
timeline_id = %TIMELINE_ID
)
}

View File

@@ -1,48 +1,24 @@
mod downloader;
pub mod heatmap;
mod heatmap_uploader;
mod scheduler;
use std::sync::Arc;
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use self::{
downloader::{downloader_task, SecondaryDetail},
heatmap_uploader::heatmap_uploader_task,
};
use self::heatmap_uploader::heatmap_uploader_task;
use super::{config::SecondaryLocationConfig, mgr::TenantManager};
use super::mgr::TenantManager;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::{completion::Barrier, sync::gate::Gate};
use utils::completion::Barrier;
enum DownloadCommand {
Download(TenantShardId),
}
enum UploadCommand {
Upload(TenantShardId),
}
impl UploadCommand {
fn get_tenant_shard_id(&self) -> &TenantShardId {
match self {
Self::Upload(id) => id,
}
}
}
impl DownloadCommand {
fn get_tenant_shard_id(&self) -> &TenantShardId {
match self {
Self::Download(id) => id,
}
}
}
struct CommandRequest<T> {
payload: T,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -52,73 +28,12 @@ struct CommandResponse {
result: anyhow::Result<()>,
}
// Whereas [`Tenant`] represents an attached tenant, this type represents the work
// we do for secondary tenant locations: where we are not serving clients or
// ingesting WAL, but we are maintaining a warm cache of layer files.
//
// This type is all about the _download_ path for secondary mode. The upload path
// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
//
// This structure coordinates TenantManager and SecondaryDownloader,
// so that the downloader can indicate which tenants it is currently
// operating on, and the manager can indicate when a particular
// secondary tenant should cancel any work in flight.
#[derive(Debug)]
pub(crate) struct SecondaryTenant {
/// Carrying a tenant shard ID simplifies callers such as the downloader
/// which need to organize many of these objects by ID.
tenant_shard_id: TenantShardId,
/// Cancellation token indicates to SecondaryDownloader that it should stop doing
/// any work for this tenant at the next opportunity.
pub(crate) cancel: CancellationToken,
pub(crate) gate: Gate,
detail: std::sync::Mutex<SecondaryDetail>,
}
impl SecondaryTenant {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
config: &SecondaryLocationConfig,
) -> Arc<Self> {
Arc::new(Self {
tenant_shard_id,
// todo: shall we make this a descendent of the
// main cancellation token, or is it sufficient that
// on shutdown we walk the tenants and fire their
// individual cancellations?
cancel: CancellationToken::new(),
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
})
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
// Wait for any secondary downloader work to complete
self.gate.close().await;
}
pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
self.detail.lock().unwrap().config = config.clone();
}
fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
}
impl SecondaryController {
@@ -148,13 +63,6 @@ impl SecondaryController {
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
.await
}
pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
self.dispatch(
&self.download_req_tx,
DownloadCommand::Download(tenant_shard_id),
)
.await
}
}
pub fn spawn_tasks(
@@ -163,37 +71,9 @@ pub fn spawn_tasks(
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> SecondaryController {
let mgr_clone = tenant_manager.clone();
let storage_clone = remote_storage.clone();
let cancel_clone = cancel.clone();
let bg_jobs_clone = background_jobs_can_start.clone();
let (download_req_tx, download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryDownloads,
None,
None,
"secondary tenant downloads",
false,
async move {
downloader_task(
mgr_clone,
storage_clone,
download_req_rx,
bg_jobs_clone,
cancel_clone,
)
.await;
Ok(())
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryUploads,
@@ -209,26 +89,16 @@ pub fn spawn_tasks(
background_jobs_can_start,
cancel,
)
.await;
Ok(())
.await
},
);
SecondaryController {
download_req_tx,
upload_req_tx,
}
SecondaryController { upload_req_tx }
}
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
pub fn null_controller() -> SecondaryController {
let (download_req_tx, _download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, _upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
SecondaryController {
upload_req_tx,
download_req_tx,
}
SecondaryController { upload_req_tx }
}

View File

@@ -1,800 +0,0 @@
use std::{
collections::{HashMap, HashSet},
pin::Pin,
str::FromStr,
sync::Arc,
time::{Duration, Instant, SystemTime},
};
use crate::{
config::PageServerConf,
metrics::SECONDARY_MODE,
tenant::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
};
use super::{
heatmap::HeatMapLayer,
scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
SecondaryTenant,
};
use crate::tenant::{
mgr::TenantManager,
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
};
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{DownloadError, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
};
use super::{
heatmap::{HeatMapTenant, HeatMapTimeline},
CommandRequest, DownloadCommand,
};
/// For each tenant, how long must have passed since the last download_tenant call before
/// calling it again. This is approximately the time by which local data is allowed
/// to fall behind remote data.
///
/// TODO: this should just be a default, and the actual period should be controlled
/// via the heatmap itself
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
pub(super) async fn downloader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) {
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
};
let mut scheduler = Scheduler::new(generator, concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("secondary_downloads"))
.await
}
struct SecondaryDownloader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
}
#[derive(Debug, Clone)]
pub(super) struct OnDiskState {
metadata: LayerFileMetadata,
access_time: SystemTime,
}
impl OnDiskState {
fn new(
_conf: &'static PageServerConf,
_tenant_shard_id: &TenantShardId,
_imeline_id: &TimelineId,
_ame: LayerFileName,
metadata: LayerFileMetadata,
access_time: SystemTime,
) -> Self {
Self {
metadata,
access_time,
}
}
}
#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
/// We remember when layers were evicted, to prevent re-downloading them.
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
}
/// This state is written by the secondary downloader, it is opaque
/// to TenantManager
#[derive(Debug)]
pub(super) struct SecondaryDetail {
pub(super) config: SecondaryLocationConfig,
last_download: Option<Instant>,
next_download: Option<Instant>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
/// Helper for logging SystemTime
fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
datetime.format("%d/%m/%Y %T")
}
impl SecondaryDetail {
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
Self {
config,
last_download: None,
next_download: None,
timelines: HashMap::new(),
}
}
}
struct PendingDownload {
secondary_state: Arc<SecondaryTenant>,
last_download: Option<Instant>,
target_time: Option<Instant>,
period: Option<Duration>,
}
impl scheduler::PendingJob for PendingDownload {
fn get_tenant_shard_id(&self) -> &TenantShardId {
self.secondary_state.get_tenant_shard_id()
}
}
struct RunningDownload {
barrier: Barrier,
}
impl scheduler::RunningJob for RunningDownload {
fn get_barrier(&self) -> Barrier {
self.barrier.clone()
}
}
struct CompleteDownload {
secondary_state: Arc<SecondaryTenant>,
completed_at: Instant,
}
impl scheduler::Completion for CompleteDownload {
fn get_tenant_shard_id(&self) -> &TenantShardId {
self.secondary_state.get_tenant_shard_id()
}
}
type Scheduler = TenantBackgroundJobs<
SecondaryDownloader,
PendingDownload,
RunningDownload,
CompleteDownload,
DownloadCommand,
>;
impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
for SecondaryDownloader
{
#[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
fn on_completion(&mut self, completion: CompleteDownload) {
let CompleteDownload {
secondary_state,
completed_at: _completed_at,
} = completion;
tracing::debug!("Secondary tenant download completed");
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
// take priority to run again.
let mut detail = secondary_state.detail.lock().unwrap();
detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
}
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
let mut result = SchedulingResult {
jobs: Vec::new(),
want_interval: None,
};
// Step 1: identify some tenants that we may work on
let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
self.tenant_manager
.foreach_secondary_tenants(|_id, secondary_state| {
tenants.push(secondary_state.clone());
});
// Step 2: filter out tenants which are not yet elegible to run
let now = Instant::now();
result.jobs = tenants
.into_iter()
.filter_map(|secondary_tenant| {
let (last_download, next_download) = {
let mut detail = secondary_tenant.detail.lock().unwrap();
if !detail.config.warm {
// Downloads are disabled for this tenant
detail.next_download = None;
return None;
}
if detail.next_download.is_none() {
// Initialize with a jitter: this spreads initial downloads on startup
// or mass-attach across our freshen interval.
let jittered_period =
rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
detail.next_download = Some(now.checked_add(jittered_period).expect(
"Using our constant, which is known to be small compared with clock range",
));
}
(detail.last_download, detail.next_download.unwrap())
};
if now < next_download {
Some(PendingDownload {
secondary_state: secondary_tenant,
last_download,
target_time: Some(next_download),
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
})
} else {
None
}
})
.collect();
// Step 3: sort by target execution time to run most urgent first.
result.jobs.sort_by_key(|j| j.target_time);
result
}
fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
let tenant_shard_id = command.get_tenant_shard_id();
let tenant = self
.tenant_manager
.get_secondary_tenant_shard(*tenant_shard_id);
let Some(tenant) = tenant else {
{
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
}
};
Ok(PendingDownload {
target_time: None,
period: None,
last_download: None,
secondary_state: tenant,
})
}
fn spawn(
&mut self,
job: PendingDownload,
) -> (
RunningDownload,
Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
) {
let PendingDownload {
secondary_state,
last_download,
target_time,
period,
} = job;
let (completion, barrier) = utils::completion::channel();
let remote_storage = self.remote_storage.clone();
let conf = self.tenant_manager.get_conf();
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
(RunningDownload { barrier }, Box::pin(async move {
let _completion = completion;
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
.download()
.await
{
Err(UpdateError::NoData) => {
tracing::info!("No heatmap found for tenant. This is fine if it is new.");
},
Err(UpdateError::NoSpace) => {
tracing::warn!("Insufficient space while downloading. Will retry later.");
}
Err(UpdateError::Cancelled) => {
tracing::debug!("Shut down while downloading");
},
Err(UpdateError::Deserialize(e)) => {
tracing::error!("Corrupt content while downloading tenant: {e}");
},
Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
tracing::error!("Error while downloading tenant: {e}");
},
Ok(()) => {}
};
// Irrespective of the result, we will reschedule ourselves to run after our usual period.
// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(period)) = (target_time, period) {
// Only track execution lag if this isn't our first download: otherwise, it is expected
// that execution will have taken longer than our configured interval, for example
// when starting up a pageserver and
if last_download.is_some() {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = Instant::now().duration_since(target_time);
warn_when_period_overrun(
elapsed,
period,
BackgroundLoopKind::SecondaryDownload,
);
}
}
CompleteDownload {
secondary_state,
completed_at: Instant::now(),
}
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
}
}
/// This type is a convenience to group together the various functions involved in
/// freshening a secondary tenant.
struct TenantDownloader<'a> {
conf: &'static PageServerConf,
remote_storage: &'a GenericRemoteStorage,
secondary_state: &'a SecondaryTenant,
}
/// Errors that may be encountered while updating a tenant
#[derive(thiserror::Error, Debug)]
enum UpdateError {
#[error("No remote data found")]
NoData,
#[error("Insufficient local storage space")]
NoSpace,
#[error("Failed to download")]
DownloadError(DownloadError),
#[error(transparent)]
Deserialize(#[from] serde_json::Error),
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<DownloadError> for UpdateError {
fn from(value: DownloadError) -> Self {
match &value {
DownloadError::Cancelled => Self::Cancelled,
DownloadError::NotFound => Self::NoData,
_ => Self::DownloadError(value),
}
}
}
impl From<std::io::Error> for UpdateError {
fn from(value: std::io::Error) -> Self {
if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
UpdateError::NoSpace
} else {
// An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
UpdateError::Other(anyhow::anyhow!(value))
}
}
}
impl<'a> TenantDownloader<'a> {
fn new(
conf: &'static PageServerConf,
remote_storage: &'a GenericRemoteStorage,
secondary_state: &'a SecondaryTenant,
) -> Self {
Self {
conf,
remote_storage,
secondary_state,
}
}
async fn download(&self) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_id();
// For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
// cover our access to local storage.
let Ok(_guard) = self.secondary_state.gate.enter() else {
// Shutting down
return Ok(());
};
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// Download the tenant's heatmap
let heatmap_bytes = tokio::select!(
bytes = self.download_heatmap() => {bytes?},
_ = self.secondary_state.cancel.cancelled() => return Ok(())
);
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
// Save the heatmap: this will be useful on restart, allowing us to reconstruct
// layer metadata without having to re-download it.
let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
let heatmap_path_bg = heatmap_path.clone();
tokio::task::spawn_blocking(move || {
tokio::runtime::Handle::current().block_on(async move {
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
})
})
.await
.expect("Blocking task is never aborted")
.maybe_fatal_err(&context_msg)?;
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
// Download the layers in the heatmap
for timeline in heatmap.timelines {
if self.secondary_state.cancel.is_cancelled() {
return Ok(());
}
let timeline_id = timeline.timeline_id;
self.download_timeline(timeline)
.instrument(tracing::info_span!(
"secondary_download_timeline",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
%timeline_id
))
.await?;
}
Ok(())
}
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
debug_assert_current_span_has_tenant_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// TODO: make download conditional on ETag having changed since last download
// (https://github.com/neondatabase/neon/issues/6199)
tracing::debug!("Downloading heatmap for secondary tenant",);
let heatmap_path = remote_heatmap_path(tenant_shard_id);
let heatmap_bytes = backoff::retry(
|| async {
let download = self
.remote_storage
.download(&heatmap_path)
.await
.map_err(UpdateError::from)?;
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
Ok(heatmap_bytes)
},
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"download heatmap",
backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
UpdateError::Cancelled
}),
)
.await?;
SECONDARY_MODE.download_heatmap.inc();
Ok(heatmap_bytes)
}
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id);
// Accumulate updates to the state
let mut touched = Vec::new();
// Clone a view of what layers already exist on disk
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
let layers_in_heatmap = timeline
.layers
.iter()
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| l.0)
.collect::<HashSet<_>>();
// Remove on-disk layers that are no longer present in heatmap
for layer in layers_on_disk.difference(&layers_in_heatmap) {
let local_path = timeline_path.join(layer.to_string());
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)
.maybe_fatal_err("Removing secondary layer")?;
}
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
if self.secondary_state.cancel.is_cancelled() {
return Ok(());
}
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{
// We already have this layer on disk. Update its access time.
tracing::debug!(
"Access time updated for layer {}: {} -> {}",
layer.name,
strftime(&on_disk.access_time),
strftime(&layer.access_time)
);
touched.push(layer);
}
continue;
} else {
tracing::debug!("Layer {} not present on disk yet", layer.name);
}
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
// recently than it was evicted.
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
if &layer.access_time > evicted_at {
tracing::info!(
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
layer.name,
strftime(&layer.access_time),
strftime(evicted_at)
);
} else {
tracing::trace!(
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
layer.name,
strftime(&layer.access_time),
strftime(evicted_at)
);
continue;
}
}
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
timeline.timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
)
.await
{
Ok(bytes) => bytes,
Err(e) => {
if let DownloadError::NotFound = e {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
// This is harmless: continue to download the next layer. It is expected during compaction
// GC.
tracing::debug!(
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
continue;
} else {
return Err(e.into());
}
}
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = timeline_path.join(layer.name.to_string());
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
}
SECONDARY_MODE.download_layer.inc();
touched.push(layer)
}
// Write updates to state to record layers we just downloaded or touched.
{
let mut detail = self.secondary_state.detail.lock().unwrap();
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
for t in touched {
use std::collections::hash_map::Entry;
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
Entry::Occupied(mut v) => {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
e.insert(OnDiskState::new(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
t.name,
LayerFileMetadata::from(&t.metadata),
t.access_time,
));
}
}
}
}
Ok(())
}
}
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
async fn init_timeline_state(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
heatmap: &HeatMapTimeline,
) -> SecondaryDetailTimeline {
let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
let mut detail = SecondaryDetailTimeline::default();
let mut dir = match tokio::fs::read_dir(&timeline_path).await {
Ok(d) => d,
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
let context = format!("Creating timeline directory {timeline_path}");
tracing::info!("{}", context);
tokio::fs::create_dir_all(&timeline_path)
.await
.fatal_err(&context);
// No entries to report: drop out.
return detail;
} else {
on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
}
}
};
// As we iterate through layers found on disk, we will look up their metadata from this map.
// Layers not present in metadata will be discarded.
let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
while let Some(dentry) = dir
.next_entry()
.await
.fatal_err(&format!("Listing {timeline_path}"))
{
let dentry_file_name = dentry.file_name();
let file_name = dentry_file_name.to_string_lossy();
let local_meta = dentry.metadata().await.fatal_err(&format!(
"Read metadata on {}",
dentry.path().to_string_lossy()
));
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
if file_name == METADATA_FILE_NAME {
continue;
}
match LayerFileName::from_str(&file_name) {
Ok(name) => {
let remote_meta = heatmap_metadata.get(&name);
match remote_meta {
Some(remote_meta) => {
// TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
if local_meta.len() != remote_meta.metadata.file_size {
// This should not happen, because we do crashsafe write-then-rename when downloading
// layers, and layers in remote storage are immutable. Remove the local file because
// we cannot trust it.
tracing::warn!(
"Removing local layer {name} with unexpected local size {} != {}",
local_meta.len(),
remote_meta.metadata.file_size
);
} else {
// We expect the access time to be initialized immediately afterwards, when
// the latest heatmap is applied to the state.
detail.on_disk_layers.insert(
name.clone(),
OnDiskState::new(
conf,
tenant_shard_id,
&heatmap.timeline_id,
name,
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
),
);
}
}
None => {
// FIXME: consider some optimization when transitioning from attached to secondary: maybe
// wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
// we will end up deleting any layers which were created+uploaded more recently than the heatmap.
tracing::info!(
"Removing secondary local layer {} because it's absent in heatmap",
name
);
tokio::fs::remove_file(&dentry.path())
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err(&format!(
"Removing layer {}",
dentry.path().to_string_lossy()
));
}
}
}
Err(_) => {
// Ignore it.
tracing::warn!("Unexpected file in timeline directory: {file_name}");
}
}
}
detail
}

View File

@@ -1,6 +1,5 @@
use std::{
collections::HashMap,
pin::Pin,
sync::{Arc, Weak},
time::{Duration, Instant},
};
@@ -8,86 +7,35 @@ use std::{
use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode,
mgr::TenantManager,
remote_timeline_client::remote_heatmap_path,
span::debug_assert_current_span_has_tenant_id,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
Tenant,
config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
},
};
use futures::Future;
use md5;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::GenericRemoteStorage;
use super::{
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
CommandRequest,
};
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, Instrument};
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
use tracing::instrument;
use utils::{backoff, completion::Barrier};
use super::{heatmap::HeatMapTenant, UploadCommand};
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) {
let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
let generator = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tenants: HashMap::new(),
};
let mut scheduler = Scheduler::new(generator, concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("heatmap_uploader"))
.await
}
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
tenants: HashMap<TenantShardId, UploaderTenantState>,
}
/// Period between heatmap uploader walking Tenants to look for work to do.
/// If any tenants have a heatmap upload period lower than this, it will be adjusted
/// downward to match.
const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
struct WriteInProgress {
barrier: Barrier,
}
impl RunningJob for WriteInProgress {
fn get_barrier(&self) -> Barrier {
self.barrier.clone()
}
}
struct UploadPending {
tenant: Arc<Tenant>,
last_digest: Option<md5::Digest>,
target_time: Option<Instant>,
period: Option<Duration>,
}
impl scheduler::PendingJob for UploadPending {
fn get_tenant_shard_id(&self) -> &TenantShardId {
self.tenant.get_tenant_shard_id()
}
}
struct WriteComplete {
@@ -97,12 +45,6 @@ struct WriteComplete {
next_upload: Option<Instant>,
}
impl scheduler::Completion for WriteComplete {
fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
}
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
@@ -126,110 +68,267 @@ struct UploaderTenantState {
next_upload: Option<Instant>,
}
type Scheduler = TenantBackgroundJobs<
HeatmapUploader,
UploadPending,
WriteInProgress,
WriteComplete,
UploadCommand,
>;
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
for HeatmapUploader
{
async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
tenants: HashMap<TenantShardId, UploaderTenantState>,
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
/// limits permit it.
tenants_pending: std::collections::VecDeque<UploadPending>,
/// Tenants for which a task in `tasks` has been spawned.
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
tasks: JoinSet<()>,
/// Channel for our child tasks to send results to: we use a channel for results rather than
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
/// behavior.
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
concurrent_uploads: usize,
scheduling_interval: Duration,
}
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
/// tenants that require an upload, or handles any commands that have been sent into
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
/// spawn.
///
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
/// all tenants that require an upload, and in between scheduling iterations we will
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
///
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
/// we might block waiting on a Tenant.
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
let mut uploader = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tasks: JoinSet::new(),
tenants: HashMap::new(),
tenants_pending: std::collections::VecDeque::new(),
tenants_uploading: HashMap::new(),
task_result_tx: result_tx,
task_result_rx: result_rx,
concurrent_uploads,
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
uploader.schedule_iteration().await?;
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
let next_scheduling_iteration = Instant::now()
.checked_add(uploader.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s), running immediately!",
uploader.scheduling_interval.as_secs_f64()
);
Instant::now()
});
loop {
tokio::select! {
_ = cancel.cancelled() => {
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
tracing::info!("Heatmap uploader joining tasks");
while let Some(_r) = uploader.tasks.join_next().await {};
tracing::info!("Heatmap uploader terminating");
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("heatmap_uploader_task: woke for command queue");
let cmd = match cmd {
Some(c) =>c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap uploader terminating");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
uploader.handle_command(payload, response_tx);
},
_ = uploader.process_next_completion() => {
if !cancel.is_cancelled() {
uploader.spawn_pending();
}
}
}
}
}
Ok(())
}
impl HeatmapUploader {
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
// Cull any entries in self.tenants whose Arc<Tenant> is gone
self.tenants
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
// The priority order of previously scheduled work may be invalidated by current state: drop
// all pending work (it will be re-scheduled if still needed)
self.tenants_pending.clear();
// Used a fixed 'now' through the following loop, for efficiency and fairness.
let now = Instant::now();
let mut result = SchedulingResult {
jobs: Vec::new(),
want_interval: None,
};
// While iterating over the potentially-long list of tenants, we will periodically yield
// to avoid blocking executor.
const YIELD_ITERATIONS: usize = 1000;
// Iterate over tenants looking for work to do.
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
let period = match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
result.want_interval = match result.want_interval {
None => Some(period),
Some(existing) => Some(std::cmp::min(period, existing)),
};
period
}
};
// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
for (i, tenant) in tenants.into_iter().enumerate() {
// Process is shutting down, drop out
if self.cancel.is_cancelled() {
return Ok(());
}
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| {
let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
last_digest: None,
}
});
// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
return;
// Skip tenants that already have a write in flight
if self
.tenants_uploading
.contains_key(tenant.get_tenant_shard_id())
{
continue;
}
let last_digest = state.last_digest;
result.jobs.push(UploadPending {
tenant,
last_digest,
target_time: state.next_upload,
period: Some(period),
});
})
.await
.ok();
self.maybe_schedule_upload(&now, tenant);
result
if i + 1 % YIELD_ITERATIONS == 0 {
tokio::task::yield_now().await;
}
}
// Spawn tasks for as many of our pending tenants as we can.
self.spawn_pending();
Ok(())
}
fn spawn(
&mut self,
job: UploadPending,
) -> (
WriteInProgress,
Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
) {
let UploadPending {
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) {
match self.task_result_rx.recv().await {
Some(r) => {
self.on_completion(r);
}
None => {
unreachable!("Result sender is stored on Self");
}
}
}
/// The 'maybe' refers to the tenant's state: whether it is configured
/// for heatmap uploads at all, and whether sufficient time has passed
/// since the last upload.
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
if period < self.scheduling_interval {
self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
}
}
}
// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
}
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(Instant::now()),
last_digest: None,
});
// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
return;
}
let last_digest = state.last_digest;
self.tenants_pending.push_back(UploadPending {
tenant,
last_digest,
target_time,
period,
} = job;
})
}
fn spawn_pending(&mut self) {
while !self.tenants_pending.is_empty()
&& self.tenants_uploading.len() < self.concurrent_uploads
{
// unwrap: loop condition includes !is_empty()
let pending = self.tenants_pending.pop_front().unwrap();
self.spawn_upload(pending.tenant, pending.last_digest);
}
}
fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
let remote_storage = self.remote_storage.clone();
let (completion, barrier) = utils::completion::channel();
let tenant_shard_id = *tenant.get_tenant_shard_id();
(WriteInProgress { barrier }, Box::pin(async move {
let (completion, barrier) = utils::completion::channel();
let result_tx = self.task_result_tx.clone();
self.tasks.spawn(async move {
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;
@@ -263,47 +362,22 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
};
let now = Instant::now();
// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(period)) = (target_time, period) {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = now.duration_since(target_time);
warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
}
let next_upload = tenant
.get_heatmap_period()
.and_then(|period| now.checked_add(period));
WriteComplete {
result_tx
.send(WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
digest,
next_upload,
}
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
}
})
.ok();
});
fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
let tenant_shard_id = command.get_tenant_shard_id();
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = self
.tenant_manager
.get_attached_tenant_shard(*tenant_shard_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed
last_digest: None,
tenant,
target_time: None,
period: None,
})
self.tenants_uploading
.insert(tenant_shard_id, WriteInProgress { barrier });
}
#[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -315,6 +389,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
digest,
next_upload,
} = completion;
self.tenants_uploading.remove(&tenant_shard_id);
use std::collections::hash_map::Entry;
match self.tenants.entry(tenant_shard_id) {
Entry::Vacant(_) => {
@@ -327,6 +402,69 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
}
}
}
fn handle_command(
&mut self,
command: UploadCommand,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
match command {
UploadCommand::Upload(tenant_shard_id) => {
// If an upload was ongoing for this tenant, let it finish first.
let barrier = if let Some(writing_state) =
self.tenants_uploading.get(&tenant_shard_id)
{
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap write to complete");
writing_state.barrier.clone()
} else {
// Spawn the upload then immediately wait for it. This will block processing of other commands and
// starting of other background work.
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = match self
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, true)
{
Ok(t) => t,
Err(e) => {
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse {
result: Err(e.into()),
}));
return;
}
};
self.spawn_upload(tenant, None);
let writing_state = self
.tenants_uploading
.get(&tenant_shard_id)
.expect("We just inserted this");
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap upload to complete");
writing_state.barrier.clone()
};
// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Heatmap upload complete");
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse { result: Ok(()) }))
});
}
}
}
}
enum UploadHeatmapOutcome {
@@ -349,6 +487,7 @@ enum UploadHeatmapError {
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,

View File

@@ -1,359 +0,0 @@
use futures::Future;
use std::{
collections::HashMap,
marker::PhantomData,
pin::Pin,
time::{Duration, Instant},
};
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use utils::{completion::Barrier, yielding_loop::yielding_loop};
use super::{CommandRequest, CommandResponse};
/// Scheduling interval is the time between calls to JobGenerator::schedule.
/// When we schedule jobs, the job generator may provide a hint of its preferred
/// interval, which we will respect within these intervals.
const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
/// Scheduling helper for background work across many tenants.
///
/// Systems that need to run background work across many tenants may use this type
/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
/// implementation to provide the work to execute. This is a simple scheduler that just
/// polls the generator for outstanding work, replacing its queue of pending work with
/// what the generator yields on each call: the job generator can change its mind about
/// the order of jobs between calls. The job generator is notified when jobs complete,
/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
/// admin APIs).
///
/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
///
/// G: A JobGenerator that this scheduler will poll to find pending jobs
/// PJ: 'Pending Job': type for job descriptors that are ready to run
/// RJ: 'Running Job' type' for jobs that have been spawned
/// C : 'Completion' type that spawned jobs will send when they finish
/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
G: JobGenerator<PJ, RJ, C, CMD>,
C: Completion,
PJ: PendingJob,
RJ: RunningJob,
{
generator: G,
/// Ready to run. Will progress to `running` once concurrent limit is satisfied, or
/// be removed on next scheduling pass.
pending: std::collections::VecDeque<PJ>,
/// Tasks currently running in Self::tasks for these tenants. Check this map
/// before pushing more work into pending for the same tenant.
running: HashMap<TenantShardId, RJ>,
tasks: JoinSet<C>,
concurrency: usize,
/// How often we would like schedule_interval to be called.
pub(super) scheduling_interval: Duration,
_phantom: PhantomData<(PJ, RJ, C, CMD)>,
}
pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
where
C: Completion,
PJ: PendingJob,
RJ: RunningJob,
{
/// Called at each scheduling interval. Return a list of jobs to run, most urgent first.
///
/// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
/// Implementations should take care to yield the executor periodically if running
/// very long loops.
///
/// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
/// jobs is not drained by the next scheduling interval, pending jobs will be cleared
/// and re-generated.
async fn schedule(&mut self) -> SchedulingResult<PJ>;
/// Called when a pending job is ready to be run.
///
/// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it.
fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
/// Called when a job previously spawned with spawn() transmits its completion
fn on_completion(&mut self, completion: C);
/// Called when a command is received. A job will be spawned immediately if the return
/// value is Some, ignoring concurrency limits and the pending queue.
fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
}
/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
pub(super) struct SchedulingResult<PJ> {
pub(super) jobs: Vec<PJ>,
/// The job generator would like to be called again this soon
pub(super) want_interval: Option<Duration>,
}
/// See [`TenantBackgroundJobs`].
pub(super) trait PendingJob {
fn get_tenant_shard_id(&self) -> &TenantShardId;
}
/// See [`TenantBackgroundJobs`].
pub(super) trait Completion: Send + 'static {
fn get_tenant_shard_id(&self) -> &TenantShardId;
}
/// See [`TenantBackgroundJobs`].
pub(super) trait RunningJob {
fn get_barrier(&self) -> Barrier;
}
impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
C: Completion,
PJ: PendingJob,
RJ: RunningJob,
G: JobGenerator<PJ, RJ, C, CMD>,
{
pub(super) fn new(generator: G, concurrency: usize) -> Self {
Self {
generator,
pending: std::collections::VecDeque::new(),
running: HashMap::new(),
tasks: JoinSet::new(),
concurrency,
scheduling_interval: MAX_SCHEDULING_INTERVAL,
_phantom: PhantomData,
}
}
pub(super) async fn run(
&mut self,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) {
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
self.schedule_iteration(&cancel).await;
if cancel.is_cancelled() {
return;
}
// Schedule some work, if concurrency limit permits it
self.spawn_pending();
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
let next_scheduling_iteration = Instant::now()
.checked_add(self.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s)",
self.scheduling_interval.as_secs_f64()
);
// unwrap(): this constant is small, cannot fail to add to time unless
// we are close to the end of the universe.
Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
});
loop {
tokio::select! {
_ = cancel.cancelled() => {
tracing::info!("joining tasks");
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
// It is the callers responsibility to make sure that the tasks they scheduled
// respect an appropriate cancellation token, to shut down promptly. It is only
// safe to wait on joining these tasks because we can see the cancellation token
// has been set.
while let Some(_r) = self.tasks.join_next().await {}
tracing::info!("terminating on cancellation token.");
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("woke for command queue");
let cmd = match cmd {
Some(c) =>c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("terminating on command queue destruction");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
self.handle_command(payload, response_tx);
},
_ = async {
let completion = self.process_next_completion().await;
match completion {
Some(c) => {
self.generator.on_completion(c);
if !cancel.is_cancelled() {
self.spawn_pending();
}
},
None => {
// Nothing is running, so just wait: expect that this future
// will be dropped when something in the outer select! fires.
cancel.cancelled().await;
}
}
} => {}
}
}
}
}
fn do_spawn(&mut self, job: PJ) {
let tenant_shard_id = *job.get_tenant_shard_id();
let (in_progress, fut) = self.generator.spawn(job);
self.tasks.spawn(fut);
self.running.insert(tenant_shard_id, in_progress);
}
/// For all pending tenants that are elegible for execution, spawn their task.
///
/// Caller provides the spawn operation, we track the resulting execution.
fn spawn_pending(&mut self) {
while !self.pending.is_empty() && self.running.len() < self.concurrency {
// unwrap: loop condition includes !is_empty()
let pending = self.pending.pop_front().unwrap();
self.do_spawn(pending);
}
}
/// For administrative commands: skip the pending queue, ignore concurrency limits
fn spawn_now(&mut self, job: PJ) -> &RJ {
let tenant_shard_id = *job.get_tenant_shard_id();
self.do_spawn(job);
self.running
.get(&tenant_shard_id)
.expect("We just inserted this")
}
/// Wait until the next task completes, and handle its completion
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) -> Option<C> {
match self.tasks.join_next().await {
Some(r) => {
// We use a channel to drive completions, but also
// need to drain the JoinSet to avoid completed tasks
// accumulating. These calls are 1:1 because every task
// we spawn into this joinset submits is result to the channel.
let completion = r.expect("Panic in background task");
self.running.remove(completion.get_tenant_shard_id());
Some(completion)
}
None => {
// Nothing is running, so we have nothing to wait for. We may drop out: the
// main even loop will call us again after the next time it has run something.
None
}
}
}
/// Convert the command into a pending job, spawn it, and when the spawned
/// job completes, send the result down `response_tx`.
fn handle_command(
&mut self,
cmd: CMD,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
let job = match self.generator.on_command(cmd) {
Ok(j) => j,
Err(e) => {
response_tx.send(CommandResponse { result: Err(e) }).ok();
return;
}
};
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
barrier
} else {
let running = self.spawn_now(job);
running.get_barrier().clone()
};
// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;
response_tx.send(CommandResponse { result: Ok(()) }).ok();
});
}
fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
self.running.get(tenant_shard_id).map(|r| r.get_barrier())
}
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
///
/// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
///
/// This function resets the pending list: it is assumed that the caller may change their mind about
/// which tenants need work between calls to schedule_iteration.
async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
let SchedulingResult {
jobs,
want_interval,
} = self.generator.schedule().await;
// Adjust interval based on feedback from the job generator
if let Some(want_interval) = want_interval {
// Calculation uses second granularity: this scheduler is not intended for high frequency tasks
self.scheduling_interval = Duration::from_secs(std::cmp::min(
std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
MAX_SCHEDULING_INTERVAL.as_secs(),
));
}
// The priority order of previously scheduled work may be invalidated by current state: drop
// all pending work (it will be re-scheduled if still needed)
self.pending.clear();
// While iterating over the potentially-long list of tenants, we will periodically yield
// to avoid blocking executor.
yielding_loop(1000, cancel, jobs.into_iter(), |job| {
// Skip tenants that already have a write in flight
if !self.running.contains_key(job.get_tenant_shard_id()) {
self.pending.push_back(job);
}
})
.await
.ok();
}
}

View File

@@ -320,8 +320,8 @@ impl DeltaLayer {
.metadata()
.context("get file metadata to determine size")?;
// This function is never used for constructing layers in a running pageserver,
// so it does not need an accurate TenantShardId.
// TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
// we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
Ok(DeltaLayer {

View File

@@ -278,8 +278,8 @@ impl ImageLayer {
.metadata()
.context("get file metadata to determine size")?;
// This function is never used for constructing layers in a running pageserver,
// so it does not need an accurate TenantShardId.
// TODO(sharding): we should get TenantShardId from path.
// OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
Ok(ImageLayer {

View File

@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::RwLock;
use super::{DeltaLayerWriter, ResidentLayer};
@@ -246,43 +246,16 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub(crate) async fn put_value(
pub async fn put_value(
&self,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
}
pub(crate) async fn put_values(
&self,
values: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
for (key, vals) in values {
for (lsn, val) in vals {
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
.await?;
}
}
Ok(())
}
async fn put_value_locked(
&self,
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let inner: &mut _ = &mut *self.inner.write().await;
self.assert_writable();
let off = {
// Avoid doing allocations for "small" values.
@@ -291,7 +264,7 @@ impl InMemoryLayer {
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
locked_inner
inner
.file
.write_blob(
&buf,
@@ -302,7 +275,7 @@ impl InMemoryLayer {
.await?
};
let vec_map = locked_inner.index.entry(key).or_default();
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
@@ -312,11 +285,13 @@ impl InMemoryLayer {
Ok(())
}
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())
}
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
pub async fn freeze(&self, end_lsn: Lsn) {

View File

@@ -945,18 +945,8 @@ impl LayerInner {
Ok((Err(e), _permit)) => {
// sleep already happened in the spawned task, if it was not cancelled
let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
match e.downcast_ref::<remote_storage::DownloadError>() {
// If the download failed due to its cancellation token,
// propagate the cancellation error upstream.
Some(remote_storage::DownloadError::Cancelled) => {
Err(DownloadError::DownloadCancelled)
}
_ => {
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
Err(DownloadError::DownloadFailed)
}
}
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
Err(DownloadError::DownloadFailed)
}
Err(_gone) => Err(DownloadError::DownloadCancelled),
}
@@ -1128,7 +1118,6 @@ impl LayerInner {
tracing::info!("evicted layer after unknown residence period");
}
}
timeline.metrics.evictions.inc();
timeline
.metrics
.resident_physical_size_sub(self.desc.file_size);

View File

@@ -45,8 +45,6 @@ pub(crate) enum BackgroundLoopKind {
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
HeatmapUpload,
SecondaryDownload,
}
impl BackgroundLoopKind {
@@ -65,11 +63,6 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
.with_label_values(&[loop_kind.as_static_str()])
.guard();
pausable_failpoint!(
"initial-size-calculation-permit-pause",
loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
);
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),

View File

@@ -373,20 +373,15 @@ pub struct GcInfo {
}
/// An error happened in a get() operation.
#[derive(thiserror::Error, Debug)]
pub(crate) enum PageReconstructError {
#[derive(thiserror::Error)]
pub enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
/// The operation was cancelled
#[error("Cancelled")]
Cancelled,
/// The ancestor of this is being stopped
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
/// An error happened replaying WAL records
@@ -407,6 +402,32 @@ enum FlushLayerError {
Other(#[from] anyhow::Error),
}
impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
}
Self::WalRedo(err) => err.fmt(f),
}
}
}
impl std::fmt::Display for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
}
Self::WalRedo(err) => err.fmt(f),
}
}
}
#[derive(Clone, Copy)]
pub enum LogicalSizeCalculationCause {
Initial,
@@ -431,21 +452,6 @@ impl std::fmt::Debug for Timeline {
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum WaitLsnError {
// Called on a timeline which is shutting down
#[error("Shutdown")]
Shutdown,
// Called on an timeline not in active state or shutting down
#[error("Bad state (not active)")]
BadState,
// Timeout expired while waiting for LSN to catch up with goal.
#[error("{0}")]
Timeout(String),
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -480,7 +486,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get(
pub async fn get(
&self,
key: Key,
lsn: Lsn,
@@ -490,11 +496,6 @@ impl Timeline {
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
}
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
// already checked the key against the shard_identity when looking up the Timeline from
// page_service.
debug_assert!(!self.shard_identity.is_key_disposable(&key));
// XXX: structured stats collection for layer eviction here.
trace!(
"get page request for {}@{} from task kind {:?}",
@@ -628,28 +629,24 @@ impl Timeline {
/// You should call this before any of the other get_* or list_* functions. Calling
/// those functions with an LSN that has been processed yet is an error.
///
pub(crate) async fn wait_lsn(
pub async fn wait_lsn(
&self,
lsn: Lsn,
_ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
} else if !self.is_active() {
return Err(WaitLsnError::BadState);
}
) -> anyhow::Result<()> {
anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
debug_assert!(
anyhow::ensure!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
anyhow::ensure!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
anyhow::ensure!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
"wait_lsn cannot be called in WAL receiver"
);
@@ -663,22 +660,18 @@ impl Timeline {
{
Ok(()) => Ok(()),
Err(e) => {
use utils::seqwait::SeqWaitError::*;
match e {
Shutdown => Err(WaitLsnError::Shutdown),
Timeout => {
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(_timer);
let walreceiver_status = self.walreceiver_status();
Err(WaitLsnError::Timeout(format!(
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(_timer);
let walreceiver_status = self.walreceiver_status();
Err(anyhow::Error::new(e).context({
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
lsn,
self.get_last_record_lsn(),
self.get_disk_consistent_lsn(),
walreceiver_status,
)))
}
}
)
}))
}
}
}
@@ -1466,7 +1459,6 @@ impl Timeline {
max_lsn_wal_lag,
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
availability_zone: self.conf.availability_zone.clone(),
ingest_batch_size: self.conf.ingest_batch_size,
},
broker_client,
ctx,
@@ -2231,13 +2223,13 @@ impl Timeline {
return Err(layer_traversal_error(
if cfg!(test) {
format!(
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
"could not find data for key {} at LSN {}, for request at LSN {}\n{}",
key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
)
} else {
format!(
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
"could not find data for key {} at LSN {}, for request at LSN {}",
key, cont_lsn, request_lsn
)
},
traversal_path,
@@ -2297,12 +2289,11 @@ impl Timeline {
ancestor
.wait_lsn(timeline.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => PageReconstructError::Cancelled,
e @ WaitLsnError::BadState => {
PageReconstructError::Other(anyhow::anyhow!(e))
}
.with_context(|| {
format!(
"wait for lsn {} on ancestor timeline_id={}",
timeline.ancestor_lsn, ancestor.timeline_id
)
})?;
timeline_owned = ancestor;
@@ -2480,27 +2471,9 @@ impl Timeline {
Ok(())
}
async fn put_values(
&self,
values: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Pick the first LSN in the batch to get the layer to write to.
for lsns in values.values() {
if let Some((lsn, _)) = lsns.first() {
let layer = self.get_layer_for_write(*lsn).await?;
layer.put_values(values, ctx).await?;
break;
}
}
Ok(())
}
async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
if let Some((_, lsn)) = tombstones.first() {
let layer = self.get_layer_for_write(*lsn).await?;
layer.put_tombstones(tombstones).await?;
}
async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
let layer = self.get_layer_for_write(lsn).await?;
layer.put_tombstone(key_range, lsn).await?;
Ok(())
}
@@ -3062,15 +3035,6 @@ impl Timeline {
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
if self.shard_identity.is_key_disposable(&key) {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
key = key.next();
continue;
}
let img = match self.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
@@ -3097,7 +3061,6 @@ impl Timeline {
}
}
};
image_layer_writer.put_image(key, &img).await?;
key = key.next();
}
@@ -3131,13 +3094,11 @@ impl Timeline {
.await
.context("fsync of newly created layer files")?;
if !all_paths.is_empty() {
par_fsync::par_fsync_async(&[self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id)])
.await
.context("fsync of timeline dir")?;
}
par_fsync::par_fsync_async(&[self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id)])
.await
.context("fsync of timeline dir")?;
let mut guard = self.layers.write().await;
@@ -3670,15 +3631,7 @@ impl Timeline {
)))
});
if !self.shard_identity.is_key_disposable(&key) {
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
}
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
if !new_layers.is_empty() {
fail_point!("after-timeline-compacted-first-L1");
@@ -4233,7 +4186,7 @@ impl Timeline {
.context("Failed to reconstruct a page image:")
{
Ok(img) => img,
Err(e) => return Err(PageReconstructError::WalRedo(e)),
Err(e) => return Err(PageReconstructError::from(e)),
};
if img.len() == page_cache::PAGE_SZ {
@@ -4576,16 +4529,8 @@ impl<'a> TimelineWriter<'a> {
self.tl.put_value(key, lsn, value, ctx).await
}
pub(crate) async fn put_batch(
&self,
batch: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.tl.put_values(batch, ctx).await
}
pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
self.tl.put_tombstones(batch).await
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
self.tl.put_tombstone(key_range, lsn).await
}
/// Track the end of the latest digested WAL record.
@@ -4596,11 +4541,11 @@ impl<'a> TimelineWriter<'a> {
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
/// the latest and can be read.
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
pub fn finish_write(&self, new_lsn: Lsn) {
self.tl.finish_write(new_lsn);
}
pub(crate) fn update_current_logical_size(&self, delta: i64) {
pub fn update_current_logical_size(&self, delta: i64) {
self.tl.update_current_logical_size(delta)
}
}

View File

@@ -58,7 +58,6 @@ pub struct WalReceiverConf {
pub max_lsn_wal_lag: NonZeroU64,
pub auth_token: Option<Arc<String>>,
pub availability_zone: Option<String>,
pub ingest_batch_size: u64,
}
pub struct WalReceiver {

View File

@@ -411,7 +411,6 @@ impl ConnectionManagerState {
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let ingest_batch_size = self.conf.ingest_batch_size;
let timeline = Arc::clone(&self.timeline);
let ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionHandler,
@@ -431,7 +430,6 @@ impl ConnectionManagerState {
connect_timeout,
ctx,
node_id,
ingest_batch_size,
)
.await;
@@ -1337,7 +1335,7 @@ mod tests {
ConnectionManagerState {
id: TenantTimelineId {
tenant_id: harness.tenant_shard_id.tenant_id,
tenant_id: harness.tenant_id,
timeline_id: TIMELINE_ID,
},
timeline,
@@ -1347,7 +1345,6 @@ mod tests {
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
auth_token: None,
availability_zone: None,
ingest_batch_size: 1,
},
wal_connection: None,
wal_stream_candidates: HashMap::new(),

View File

@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
@@ -106,7 +106,6 @@ impl From<WalDecodeError> for WalReceiverError {
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
#[allow(clippy::too_many_arguments)]
pub(super) async fn handle_walreceiver_connection(
timeline: Arc<Timeline>,
wal_source_connconf: PgConnectionConfig,
@@ -115,7 +114,6 @@ pub(super) async fn handle_walreceiver_connection(
connect_timeout: Duration,
ctx: RequestContext,
node: NodeId,
ingest_batch_size: u64,
) -> Result<(), WalReceiverError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -307,9 +305,7 @@ pub(super) async fn handle_walreceiver_connection(
{
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(startlsn);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
let mut modification = timeline.begin_modification(endlsn);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
@@ -318,40 +314,14 @@ pub(super) async fn handle_walreceiver_connection(
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
// Ingest the records without immediately committing them.
let ingested = walingest
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);
modification.commit(&ctx).await?;
uncommitted_records = 0;
filtered_records = 0;
}
}
// Commit the remaining records.
if uncommitted_records > 0 {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);
modification.commit(&ctx).await?;
}
}

View File

@@ -18,8 +18,7 @@ use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use std::sync::{RwLock, RwLockWriteGuard};
use utils::fs_ext;
///
@@ -112,7 +111,7 @@ impl OpenFiles {
///
/// On return, we hold a lock on the slot, and its 'tag' has been updated
/// recently_used has been set. It's all ready for reuse.
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
//
// Run the clock algorithm to find a slot to replace.
//
@@ -144,7 +143,7 @@ impl OpenFiles {
}
retries += 1;
} else {
slot_guard = slot.inner.write().await;
slot_guard = slot.inner.write().unwrap();
index = next;
break;
}
@@ -251,29 +250,6 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
}
}
/// Observe duration for the given storage I/O operation
///
/// Unlike `observe_closure_duration`, this supports async,
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
($op:expr, $($body:tt)*) => {{
let instant = Instant::now();
let result = $($body)*;
let elapsed = instant.elapsed().as_secs_f64();
STORAGE_IO_TIME_METRIC
.get($op)
.observe(elapsed);
result
}}
}
macro_rules! with_file {
($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
let $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
}
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
@@ -310,12 +286,14 @@ impl VirtualFile {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| open_options.open(path))?;
// Strip all options other than read and write.
//
@@ -388,24 +366,22 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File.
pub async fn sync_all(&self) -> Result<(), Error> {
with_file!(self, StorageIoOperation::Fsync, |file| file
.as_ref()
.sync_all())
self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
.await?
}
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
with_file!(self, StorageIoOperation::Metadata, |file| file
.as_ref()
.metadata())
self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
.await?
}
/// Helper function internal to `VirtualFile` that looks up the underlying File,
/// opens it and evicts some other File if necessary. The passed parameter is
/// assumed to be a function available for the physical `File`.
///
/// We are doing it via a macro as Rust doesn't support async closures that
/// take on parameters with lifetimes.
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
/// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File.
async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
where
F: FnMut(&File) -> R,
{
let open_files = get_open_files();
let mut handle_guard = {
@@ -415,23 +391,27 @@ impl VirtualFile {
// We only need to hold the handle lock while we read the current handle. If
// another thread closes the file and recycles the slot for a different file,
// we will notice that the handle we read is no longer valid and retry.
let mut handle = *self.handle.read().await;
let mut handle = *self.handle.read().unwrap();
loop {
// Check if the slot contains our File
{
let slot = &open_files.slots[handle.index];
let slot_guard = slot.inner.read().await;
if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(FileGuard { slot_guard });
let slot_guard = slot.inner.read().unwrap();
if slot_guard.tag == handle.tag {
if let Some(file) = &slot_guard.file {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(file)));
}
}
}
// The slot didn't contain our File. We will have to open it ourselves,
// but before that, grab a write lock on handle in the VirtualFile, so
// that no other thread will try to concurrently open the same file.
let handle_guard = self.handle.write().await;
let handle_guard = self.handle.write().unwrap();
// If another thread changed the handle while we were not holding the lock,
// then the handle might now be valid again. Loop back to retry.
@@ -445,16 +425,20 @@ impl VirtualFile {
// We need to open the file ourselves. The handle in the VirtualFile is
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
let (handle, mut slot_guard) = open_files.find_victim_slot();
// Re-open the physical file.
// NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
// case from StorageIoOperation::Open. This helps with identifying thrashing
// of the virtual file descriptor cache.
let file = observe_duration!(
StorageIoOperation::OpenAfterReplace,
self.open_options.open(&self.path)
)?;
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::OpenAfterReplace)
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(&file));
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
@@ -462,9 +446,7 @@ impl VirtualFile {
*handle_guard = handle;
return Ok(FileGuard {
slot_guard: slot_guard.downgrade(),
});
Ok(result)
}
pub fn remove(self) {
@@ -479,9 +461,11 @@ impl VirtualFile {
self.pos = offset;
}
SeekFrom::End(offset) => {
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
.as_ref()
.seek(SeekFrom::End(offset)))?
self.pos = self
.with_file(StorageIoOperation::Seek, |mut file| {
file.seek(SeekFrom::End(offset))
})
.await??
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
@@ -569,9 +553,9 @@ impl VirtualFile {
}
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Read, |file| file
.as_ref()
.read_at(buf, offset));
let result = self
.with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
.await?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -581,9 +565,9 @@ impl VirtualFile {
}
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Write, |file| file
.as_ref()
.write_at(buf, offset));
let result = self
.with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
.await?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -593,18 +577,6 @@ impl VirtualFile {
}
}
struct FileGuard<'a> {
slot_guard: RwLockReadGuard<'a, SlotInner>,
}
impl<'a> AsRef<File> for FileGuard<'a> {
fn as_ref(&self) -> &File {
// This unwrap is safe because we only create `FileGuard`s
// if we know that the file is Some.
self.slot_guard.file.as_ref().unwrap()
}
}
#[cfg(test)]
impl VirtualFile {
pub(crate) async fn read_blk(
@@ -637,41 +609,22 @@ impl VirtualFile {
impl Drop for VirtualFile {
/// If a VirtualFile is dropped, close the underlying file if it was open.
fn drop(&mut self) {
let handle = self.handle.get_mut();
let handle = self.handle.get_mut().unwrap();
fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
if slot_guard.tag == tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
if let Some(fd) = slot_guard.file.take() {
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(fd));
}
// We could check with a read-lock first, to avoid waiting on an
// unrelated I/O.
let slot = &get_open_files().slots[handle.index];
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
if let Some(fd) = slot_guard.file.take() {
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(fd));
}
}
// We don't have async drop so we cannot directly await the lock here.
// Instead, first do a best-effort attempt at closing the underlying
// file descriptor by using `try_write`, and if that fails, spawn
// a tokio task to do it asynchronously: we just want it to be
// cleaned up eventually.
// Most of the time, the `try_lock` should succeed though,
// as we have `&mut self` access. In other words, if the slot
// is still occupied by our file, there should be no access from
// other I/O operations; the only other possible place to lock
// the slot is the lock algorithm looking for free slots.
let slot = &get_open_files().slots[handle.index];
if let Ok(slot_guard) = slot.inner.try_write() {
clean_slot(slot, slot_guard, handle.tag);
} else {
let tag = handle.tag;
tokio::spawn(async move {
let slot_guard = slot.inner.write().await;
clean_slot(slot, slot_guard, tag);
});
};
}
}

View File

@@ -29,7 +29,6 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use utils::failpoint_support;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
@@ -48,18 +47,20 @@ use postgres_ffi::TransactionId;
use postgres_ffi::BLCKSZ;
use utils::lsn::Lsn;
pub struct WalIngest {
pub struct WalIngest<'a> {
shard: ShardIdentity,
timeline: &'a Timeline,
checkpoint: CheckPoint,
checkpoint_modified: bool,
}
impl WalIngest {
impl<'a> WalIngest<'a> {
pub async fn new(
timeline: &Timeline,
timeline: &'a Timeline,
startpoint: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<WalIngest> {
ctx: &'_ RequestContext,
) -> anyhow::Result<WalIngest<'a>> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -68,6 +69,7 @@ impl WalIngest {
Ok(WalIngest {
shard: *timeline.get_shard_identity(),
timeline,
checkpoint,
checkpoint_modified: false,
})
@@ -81,8 +83,6 @@ impl WalIngest {
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
///
/// This function returns `true` if the record was ingested, and `false` if it was filtered out
///
pub async fn ingest_record(
&mut self,
recdata: Bytes,
@@ -90,13 +90,11 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
decoded: &mut DecodedWALRecord,
ctx: &RequestContext,
) -> anyhow::Result<bool> {
) -> anyhow::Result<()> {
WAL_INGEST.records_received.inc();
let pg_version = modification.tline.pg_version;
let prev_len = modification.len();
modification.set_lsn(lsn)?;
decode_wal_record(recdata, decoded, pg_version)?;
modification.lsn = lsn;
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
@@ -133,9 +131,9 @@ impl WalIngest {
}
pg_constants::RM_DBASE_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
debug!(%info, %pg_version, "handle RM_DBASE_ID");
debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
if pg_version == 14 {
if self.timeline.pg_version == 14 {
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
debug!("XLOG_DBASE_CREATE v14");
@@ -151,7 +149,7 @@ impl WalIngest {
.await?;
}
}
} else if pg_version == 15 {
} else if self.timeline.pg_version == 15 {
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -171,7 +169,7 @@ impl WalIngest {
.await?;
}
}
} else if pg_version == 16 {
} else if self.timeline.pg_version == 16 {
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -346,7 +344,9 @@ impl WalIngest {
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
crate::failpoint_support::sleep_millis_async!(
"wal-ingest-logical-message-sleep"
);
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
modification.put_file(path, message, ctx).await?;
}
@@ -400,11 +400,19 @@ impl WalIngest {
self.checkpoint_modified = false;
}
// Note that at this point this record is only cached in the modification
// until commit() is called to flush the data into the repository and update
// the latest LSN.
if modification.is_empty() {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
modification.tline.finish_write(lsn);
} else {
WAL_INGEST.records_committed.inc();
modification.commit(ctx).await?;
}
Ok(modification.len() > prev_len)
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN.
Ok(())
}
/// Do not store this block, but observe it for the purposes of updating our relation size state.
@@ -451,7 +459,7 @@ impl WalIngest {
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
// do not materialize null pages because them most likely be soon replaced with real data
&& blk.bimg_len != 0
{
@@ -504,7 +512,7 @@ impl WalIngest {
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
match modification.tline.pg_version {
match self.timeline.pg_version {
14 => {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -728,7 +736,7 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -809,11 +817,10 @@ impl WalIngest {
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
let pg_version = modification.tline.pg_version;
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
match pg_version {
match self.timeline.pg_version {
16 => {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -876,7 +883,7 @@ impl WalIngest {
}
_ => bail!(
"Neon RMGR has no known compatibility with PostgreSQL version {}",
pg_version
self.timeline.pg_version
),
}
@@ -899,7 +906,7 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -977,14 +984,16 @@ impl WalIngest {
let src_db_id = rec.src_db_id;
let src_tablespace_id = rec.src_tablespace_id;
// Creating a database is implemented by copying the template (aka. source) database.
// To copy all the relations, we need to ask for the state as of the same LSN, but we
// cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
// the last valid LSN to advance up to it. So we use the previous record's LSN in the
// get calls instead.
let req_lsn = modification.tline.get_last_record_lsn();
let rels = modification
.tline
.list_rels(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
.await?;
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -992,12 +1001,7 @@ impl WalIngest {
// Copy relfilemap
let filemap = modification
.tline
.get_relmap_file(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
.await?;
modification
.put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1011,7 +1015,7 @@ impl WalIngest {
let nblocks = modification
.tline
.get_rel_size(src_rel, Version::Modified(modification), true, ctx)
.get_rel_size(src_rel, req_lsn, true, ctx)
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
@@ -1029,13 +1033,7 @@ impl WalIngest {
let content = modification
.tline
.get_rel_page_at_lsn(
src_rel,
blknum,
Version::Modified(modification),
true,
ctx,
)
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
@@ -1106,7 +1104,7 @@ impl WalIngest {
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
fsm_physical_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1128,7 +1126,7 @@ impl WalIngest {
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
vm_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1201,9 +1199,10 @@ impl WalIngest {
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
let last_lsn = self.timeline.get_last_record_lsn();
if modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.get_rel_exists(rel, last_lsn, true, ctx)
.await?
{
self.put_rel_drop(modification, rel, ctx).await?;
@@ -1257,9 +1256,10 @@ impl WalIngest {
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = modification.tline.get_last_record_lsn();
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.list_slru_segments(SlruKind::Clog, req_lsn, ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1471,6 +1471,20 @@ impl WalIngest {
Ok(())
}
async fn get_relsize(
&mut self,
rel: RelTag,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<BlockNumber> {
let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
0
} else {
self.timeline.get_rel_size(rel, lsn, true, ctx).await?
};
Ok(nblocks)
}
async fn handle_rel_extend(
&mut self,
modification: &mut DatadirModification<'_>,
@@ -1482,6 +1496,7 @@ impl WalIngest {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = modification.lsn;
// Get current size and put rel creation if rel doesn't exist
//
@@ -1489,14 +1504,11 @@ impl WalIngest {
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
let old_nblocks = if let Some(nblocks) = modification
.tline
.get_cached_rel_size(&rel, modification.get_lsn())
{
let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
nblocks
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
} else if !self
.timeline
.get_rel_exists(rel, last_lsn, true, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1506,10 +1518,7 @@ impl WalIngest {
.context("Relation Error")?;
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
};
if new_nblocks > old_nblocks {
@@ -1562,9 +1571,10 @@ impl WalIngest {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let old_nblocks = if !modification
.tline
.get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
let last_lsn = self.timeline.get_last_record_lsn();
let old_nblocks = if !self
.timeline
.get_slru_segment_exists(kind, segno, last_lsn, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1573,9 +1583,8 @@ impl WalIngest {
.await?;
0
} else {
modification
.tline
.get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
self.timeline
.get_slru_segment_size(kind, segno, last_lsn, ctx)
.await?
};
@@ -1598,26 +1607,6 @@ impl WalIngest {
}
}
async fn get_relsize(
modification: &DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> anyhow::Result<BlockNumber> {
let nblocks = if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
Ok(nblocks)
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
@@ -1644,7 +1633,10 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
async fn init_walingest_test<'a>(
tline: &'a Timeline,
ctx: &RequestContext,
) -> Result<WalIngest<'a>> {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1689,29 +1681,29 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.await?,
3
);
@@ -1719,46 +1711,46 @@ mod tests {
// Check page contents at each LSN
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 2")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 2 at 5")
);
@@ -1774,19 +1766,19 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
@@ -1794,13 +1786,13 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.await?,
3
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 2 at 5")
);
@@ -1813,7 +1805,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
.await?,
0
);
@@ -1826,19 +1818,19 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
.await?,
ZERO_PAGE
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
.await?,
TEST_IMG("foo blk 1")
);
@@ -1851,21 +1843,21 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
.await?,
1501
);
for blk in 2..1500 {
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
.await?,
ZERO_PAGE
);
}
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
.await?,
TEST_IMG("foo blk 1500")
);
@@ -1892,13 +1884,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
1
);
@@ -1911,7 +1903,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
.await?,
false
);
@@ -1929,13 +1921,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
.await?,
1
);
@@ -1968,24 +1960,24 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
relsize
);
@@ -1996,7 +1988,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2013,7 +2005,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
.await?,
1
);
@@ -2023,7 +2015,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2032,7 +2024,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.await?,
relsize
);
@@ -2041,7 +2033,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2061,13 +2053,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
.await?,
relsize
);
@@ -2077,7 +2069,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2110,9 +2102,7 @@ mod tests {
assert_current_logical_size(&tline, Lsn(lsn));
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE + 1
);
@@ -2124,9 +2114,7 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
);
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2139,9 +2127,7 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
);
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2157,9 +2143,7 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber
);
@@ -2196,7 +2180,7 @@ mod tests {
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
let source_initdb_path = format!("{path}/{INITDB_PATH}");
let startpoint = Lsn::from_hex("14AEC08").unwrap();
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
let endpoint = Lsn::from_hex("1FFFF98").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let (tenant, ctx) = harness.load().await;
@@ -2238,7 +2222,7 @@ mod tests {
let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
.await
.unwrap();
let mut modification = tline.begin_modification(startpoint);
let mut modification = tline.begin_modification(endpoint);
let mut decoded = DecodedWALRecord::default();
println!("decoding {} bytes", bytes.len() - xlogoff);
@@ -2252,7 +2236,6 @@ mod tests {
.await
.unwrap();
}
modification.commit(&ctx).await.unwrap();
}
let duration = started_at.elapsed();

View File

@@ -22,7 +22,6 @@ use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use std::collections::VecDeque;
use std::io;
@@ -36,11 +35,14 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::Duration;
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
#[cfg(feature = "testing")]
use std::sync::atomic::{AtomicUsize, Ordering};
#[cfg(feature = "testing")]
use pageserver_api::shard::TenantShardId;
use crate::config::PageServerConf;
use crate::metrics::{
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -90,7 +92,7 @@ struct ProcessOutput {
/// records.
///
pub struct PostgresRedoManager {
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -184,13 +186,10 @@ impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
pub fn new(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
) -> PostgresRedoManager {
pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
// The actual process is launched lazily, on first request.
PostgresRedoManager {
tenant_shard_id,
tenant_id,
conf,
last_redo_at: std::sync::Mutex::default(),
redo_process: RwLock::new(None),
@@ -245,12 +244,8 @@ impl PostgresRedoManager {
let timer =
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
let proc = Arc::new(
WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
.context("launch walredo process")?,
);
timer.observe_duration();
*proc_guard = Some(Arc::clone(&proc));
@@ -643,7 +638,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
@@ -657,10 +652,10 @@ impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
pg_version: u32,
) -> anyhow::Result<Self> {
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -685,7 +680,7 @@ impl WalRedoProcess {
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
.spawn_no_leak_child(tenant_shard_id)
.spawn_no_leak_child(tenant_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
@@ -746,12 +741,12 @@ impl WalRedoProcess {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
tenant_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
@@ -777,7 +772,7 @@ impl WalRedoProcess {
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
fn apply_wal_records(
&self,
tag: BufferTag,
@@ -971,7 +966,11 @@ impl WalRedoProcess {
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
// TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
let path = self
.conf
.tenant_path(&TenantShardId::unsharded(self.tenant_id))
.join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
@@ -1005,7 +1004,7 @@ impl Drop for WalRedoProcess {
/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
struct NoLeakChild {
tenant_id: TenantShardId,
tenant_id: TenantId,
child: Option<Child>,
}
@@ -1024,7 +1023,7 @@ impl DerefMut for NoLeakChild {
}
impl NoLeakChild {
fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
let child = command.spawn()?;
Ok(NoLeakChild {
tenant_id,
@@ -1079,7 +1078,7 @@ impl Drop for NoLeakChild {
Some(child) => child,
None => return,
};
let tenant_shard_id = self.tenant_id;
let tenant_id = self.tenant_id;
// Offload the kill+wait of the child process into the background.
// If someone stops the runtime, we'll leak the child process.
// We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1087,11 +1086,7 @@ impl Drop for NoLeakChild {
tokio::task::spawn_blocking(move || {
// Intentionally don't inherit the tracing context from whoever is dropping us.
// This thread here is going to outlive of our dropper.
let span = tracing::info_span!(
"walredo",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
);
let span = tracing::info_span!("walredo", %tenant_id);
let _entered = span.enter();
Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
})
@@ -1101,11 +1096,11 @@ impl Drop for NoLeakChild {
}
trait NoLeakChildCommandExt {
fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
}
impl NoLeakChildCommandExt for Command {
fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
NoLeakChild::spawn(tenant_id, self)
}
}
@@ -1160,7 +1155,6 @@ mod tests {
use crate::repository::Key;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
use bytes::Bytes;
use pageserver_api::shard::TenantShardId;
use std::str::FromStr;
use utils::{id::TenantId, lsn::Lsn};
@@ -1270,9 +1264,9 @@ mod tests {
let repo_dir = camino_tempfile::tempdir()?;
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let tenant_id = TenantId::generate();
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = PostgresRedoManager::new(conf, tenant_id);
Ok(RedoHarness {
_repo_dir: repo_dir,

View File

@@ -35,8 +35,7 @@
#define PageStoreTrace DEBUG5
#define MIN_RECONNECT_INTERVAL_USEC 1000
#define MAX_RECONNECT_INTERVAL_USEC 1000000
#define RECONNECT_INTERVAL_USEC 1000000
bool connected = false;
PGconn *pageserver_conn = NULL;
@@ -134,11 +133,6 @@ pageserver_connect(int elevel)
const char *values[3];
int n;
static TimestampTz last_connect_time = 0;
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
TimestampTz now;
uint64_t us_since_last_connect;
Assert(!connected);
if (CheckConnstringUpdated())
@@ -146,22 +140,6 @@ pageserver_connect(int elevel)
ReloadConnstring();
}
now = GetCurrentTimestamp();
us_since_last_connect = now - last_connect_time;
if (us_since_last_connect < delay_us)
{
pg_usleep(delay_us - us_since_last_connect);
delay_us *= 2;
if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
delay_us = MAX_RECONNECT_INTERVAL_USEC;
last_connect_time = GetCurrentTimestamp();
}
else
{
delay_us = MIN_RECONNECT_INTERVAL_USEC;
last_connect_time = now;
}
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -355,6 +333,7 @@ pageserver_send(NeonRequest *request)
{
HandleMainLoopInterrupts();
n_reconnect_attempts += 1;
pg_usleep(RECONNECT_INTERVAL_USEC);
}
n_reconnect_attempts = 0;
}

View File

@@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
port = strchr(host, ':');
if (port == NULL)
{
wp_log(FATAL, "port is not specified");
walprop_log(FATAL, "port is not specified");
}
*port++ = '\0';
sep = strchr(port, ',');
@@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
*sep++ = '\0';
if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
{
wp_log(FATAL, "too many safekeepers");
walprop_log(FATAL, "Too many safekeepers");
}
wp->safekeeper[wp->n_safekeepers].host = host;
wp->safekeeper[wp->n_safekeepers].port = port;
@@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
if (written > MAXCONNINFO || written < 0)
wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
}
initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
@@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
}
if (wp->n_safekeepers < 1)
{
wp_log(FATAL, "safekeepers addresses are not specified");
walprop_log(FATAL, "Safekeepers addresses are not specified");
}
wp->quorum = wp->n_safekeepers / 2 + 1;
@@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
wp->greetRequest.systemId = wp->config->systemId;
if (!wp->config->neon_timeline)
wp_log(FATAL, "neon.timeline_id is not provided");
walprop_log(FATAL, "neon.timeline_id is not provided");
if (*wp->config->neon_timeline != '\0' &&
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
if (!wp->config->neon_tenant)
wp_log(FATAL, "neon.tenant_id is not provided");
walprop_log(FATAL, "neon.tenant_id is not provided");
if (*wp->config->neon_tenant != '\0' &&
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
wp->greetRequest.timeline = wp->config->pgTimeline;
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
@@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp)
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
wp->config->safekeeper_connection_timeout))
{
wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
ShutdownConnection(sk);
}
}
@@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk)
*
* https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
*/
wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
/*
* Even though the connection failed, we still need to clean up the
@@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk)
* (see libpqrcv_connect, defined in
* src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
*/
wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
sk->state = SS_CONNECTING_WRITE;
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
@@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp)
static void
AdvancePollState(Safekeeper *sk, uint32 events)
{
#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */
#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */
WalProposer *wp = sk->wp;
#endif
@@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* ResetConnection
*/
case SS_OFFLINE:
wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline",
sk->host, sk->port);
walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
sk->host, sk->port);
break; /* actually unreachable, but prevents
* -Wimplicit-fallthrough */
@@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* requests.
*/
case SS_VOTING:
wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk));
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return;
@@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
* Idle state for waiting votes from quorum.
*/
case SS_IDLE:
wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk));
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return;
@@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk)
switch (result)
{
case WP_CONN_POLLING_OK:
wp_log(LOG, "connected with node %s:%s", sk->host,
sk->port);
walprop_log(LOG, "connected with node %s:%s", sk->host,
sk->port);
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
/*
@@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk)
break;
case WP_CONN_POLLING_FAILED:
wp_log(WARNING, "failed to connect to node '%s:%s': %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
/*
* If connecting failed, we don't want to restart the connection
@@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk)
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
{
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
}
@@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk)
break;
case WP_EXEC_FAILED:
wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
@@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk)
* wrong"
*/
case WP_EXEC_UNEXPECTED_SUCCESS:
wp_log(WARNING, "received bad response from safekeeper %s:%s query execution",
sk->host, sk->port);
walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
sk->host, sk->port);
ShutdownConnection(sk);
return;
}
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
return;
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
@@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (wp->n_connected == wp->quorum)
{
wp->propTerm++;
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
wp->voteRequest = (VoteRequest)
{
@@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
else if (sk->greetResponse.term > wp->propTerm)
{
/* Another compute with higher term is running. */
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->greetResponse.term, wp->propTerm);
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->greetResponse.term, wp->propTerm);
}
/*
@@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk)
WalProposer *wp = sk->wp;
/* We have quorum for voting, send our vote request */
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
/* On failure, logging & resetting is handled */
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
return;
@@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
return;
wp_log(LOG,
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
walprop_log(LOG,
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
/*
* In case of acceptor rejecting our vote, bail out, but only if either it
@@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk)
if ((!sk->voteResponse.voteGiven) &&
(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
{
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->voteResponse.term, wp->propTerm);
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->voteResponse.term, wp->propTerm);
}
Assert(sk->voteResponse.term == wp->propTerm);
@@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp)
*/
if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
{
wp_log(FATAL, "failed to download WAL for logical replicaiton");
walprop_log(FATAL, "failed to download WAL for logical replicaiton");
}
if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
@@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp)
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
{
wp_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
walprop_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
}
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
}
@@ -969,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp)
{
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
}
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
}
/*
@@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
wp->quorum,
wp->propTerm,
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
LSN_FORMAT_ARGS(wp->truncateLsn));
walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
wp->quorum,
wp->propTerm,
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
LSN_FORMAT_ARGS(wp->truncateLsn));
/*
* Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp)
* scenario.
*/
disable_core_dump();
wp_log(PANIC,
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
walprop_log(PANIC,
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
}
}
walprop_shared->mineLastElectedTerm = wp->propTerm;
@@ -1091,10 +1091,34 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
if (sk->startStreamingAt < wp->truncateLsn)
{
/*
* There's a gap between the WAL starting point and a truncateLsn,
* which can't appear in a normal working cluster. That gap means
* that all safekeepers reported that they have persisted WAL up
* to the truncateLsn before, but now current safekeeper tells
* otherwise.
*
* Also we have a special condition here, which is empty
* safekeeper with no history. In combination with a gap, that can
* happen when we introduce a new safekeeper to the cluster. This
* is a rare case, which is triggered manually for now, and should
* be treated with care.
*/
/*
* truncateLsn will not change without ack from current
* safekeeper, and it's aligned to the WAL record, so we can
* safely start streaming from this point.
*/
sk->startStreamingAt = wp->truncateLsn;
walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
LSN_FORMAT_ARGS(sk->startStreamingAt));
}
}
else
{
@@ -1117,7 +1141,7 @@ SendProposerElected(Safekeeper *sk)
}
}
Assert(sk->startStreamingAt <= wp->availableLsn);
Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);
msg.tag = 'e';
msg.term = wp->propTerm;
@@ -1126,9 +1150,9 @@ SendProposerElected(Safekeeper *sk)
msg.timelineStartLsn = wp->timelineStartLsn;
lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
wp_log(LOG,
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
walprop_log(LOG,
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
resetStringInfo(&sk->outbuf);
pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1237,8 +1261,8 @@ HandleActiveState(Safekeeper *sk, uint32 events)
/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
if (events & WL_SOCKET_CLOSED)
{
wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
sk->host, sk->port);
walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
sk->host, sk->port);
ShutdownConnection(sk);
return;
}
@@ -1299,12 +1323,12 @@ SendAppendRequests(Safekeeper *sk)
req = &sk->appendRequest;
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
req->endLsn - req->beginLsn,
LSN_FORMAT_ARGS(req->beginLsn),
LSN_FORMAT_ARGS(req->endLsn),
LSN_FORMAT_ARGS(req->commitLsn),
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
req->endLsn - req->beginLsn,
LSN_FORMAT_ARGS(req->beginLsn),
LSN_FORMAT_ARGS(req->endLsn),
LSN_FORMAT_ARGS(req->commitLsn),
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
resetStringInfo(&sk->outbuf);
@@ -1331,8 +1355,8 @@ SendAppendRequests(Safekeeper *sk)
case NEON_WALREAD_WOULDBLOCK:
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
@@ -1364,9 +1388,9 @@ SendAppendRequests(Safekeeper *sk)
return true;
case PG_ASYNC_WRITE_FAIL:
wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
default:
@@ -1405,11 +1429,11 @@ RecvAppendResponses(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
break;
wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
sk->appendResponse.term,
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
sk->host, sk->port);
walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
sk->appendResponse.term,
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
sk->host, sk->port);
if (sk->appendResponse.term > wp->propTerm)
{
@@ -1419,9 +1443,9 @@ RecvAppendResponses(Safekeeper *sk)
* core as this is kinda expected scenario.
*/
disable_core_dump();
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
sk->host, sk->port,
sk->appendResponse.term, wp->propTerm);
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
sk->host, sk->port,
sk->appendResponse.term, wp->propTerm);
}
readAnything = true;
@@ -1465,32 +1489,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->currentClusterSize = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
rf->currentClusterSize);
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
rf->currentClusterSize);
}
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->last_received_lsn = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
LSN_FORMAT_ARGS(rf->last_received_lsn));
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
LSN_FORMAT_ARGS(rf->last_received_lsn));
}
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
}
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
}
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
{
@@ -1502,8 +1526,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
/* Copy because timestamptz_to_str returns a static buffer */
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
rf->replytime, replyTimeStr);
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
rf->replytime, replyTimeStr);
pfree(replyTimeStr);
}
@@ -1517,7 +1541,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
* Skip unknown keys to support backward compatibile protocol
* changes
*/
wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
pq_getmsgbytes(reply_message, len);
};
}
@@ -1582,7 +1606,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
if (wp->n_votes < wp->quorum)
{
wp_log(WARNING, "GetDonor called before elections are won");
walprop_log(WARNING, "GetDonor called before elections are won");
return NULL;
}
@@ -1710,9 +1734,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
return false;
case PG_ASYNC_READ_FAIL:
wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host,
sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
}
@@ -1750,8 +1774,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
@@ -1827,9 +1851,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
if (!wp->api.conn_blocking_write(sk, msg, msg_size))
{
wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
}
@@ -1880,9 +1904,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
return false;
case PG_ASYNC_WRITE_FAIL:
wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
default:
@@ -1919,9 +1943,9 @@ AsyncFlush(Safekeeper *sk)
/* Nothing to do; try again when the socket's ready */
return false;
case -1:
wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk),
wp->api.conn_error_message(sk));
ResetConnection(sk);
return false;
default:
@@ -1950,11 +1974,11 @@ CompareLsn(const void *a, const void *b)
*
* The strings are intended to be used as a prefix to "state", e.g.:
*
* wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
*
* If this sort of phrasing doesn't fit the message, instead use something like:
*
* wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
*/
static char *
FormatSafekeeperState(Safekeeper *sk)
@@ -2035,8 +2059,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
* To give a descriptive message in the case of failure, we use elog
* and then an assertion that's guaranteed to fail.
*/
wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
Assert(events_ok_for_state);
}
}
@@ -2175,8 +2199,8 @@ FormatEvents(WalProposer *wp, uint32 events)
if (events & (~all_flags))
{
wp_log(WARNING, "event formatting found unexpected component %d",
events & (~all_flags));
walprop_log(WARNING, "Event formatting found unexpected component %d",
events & (~all_flags));
return_str[6] = '*';
return_str[7] = '\0';
}

View File

@@ -707,23 +707,11 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
#define WPEVENT 1337 /* special log level for walproposer internal
* events */
#define WP_LOG_PREFIX "[WP] "
/*
* wp_log is used in pure wp code (walproposer.c), allowing API callback to
* catch logging.
*/
#ifdef WALPROPOSER_LIB
extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
#else
#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
#endif
/*
* And wpg_log is used all other (postgres specific) walproposer code, just
* adding prefix.
*/
#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
#endif /* __NEON_WALPROPOSER_H__ */

View File

@@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
{
StartReplicationCmd cmd;
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
elog(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
cmd.timeline = wp->greetRequest.timeline;
cmd.startpoint = startpos;
@@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void)
{
load_file("libpqwalreceiver", false);
if (WalReceiverFunctions == NULL)
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
}
/* Helper function */
@@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo)
* PGconn structure"
*/
if (!pg_conn)
wpg_log(FATAL, "failed to allocate new PGconn object");
elog(FATAL, "failed to allocate new PGconn object");
/*
* And in theory this allocation can fail as well, but it's incredibly
@@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk)
* unused. We'll expect it's never returned.
*/
case PGRES_POLLING_ACTIVE:
wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
/*
* This return is never actually reached, but it's here to make
@@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
*/
if (!result)
{
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
return WP_EXEC_UNEXPECTED_SUCCESS;
}
@@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
}
if (unexpected_success)
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
return return_val;
}
@@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
if (status != PGRES_FATAL_ERROR)
wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
/*
* If there was actually an error, it'll be properly reported
@@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
elog(FATAL, "invalid return %d from PQputCopyData", result);
}
/*
@@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
wpg_log(FATAL, "invalid return %d from PQflush", result);
elog(FATAL, "invalid return %d from PQflush", result);
}
}
@@ -1237,6 +1237,19 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
return true; /* recovery not needed */
endpos = wp->propEpochStartLsn;
/*
* If we need to download more than a max_slot_wal_keep_size, cap to it to
* avoid risk of exploding pg_wal. Logical replication won't work until
* recreated, but at least compute would start; this also follows
* max_slot_wal_keep_size semantics.
*/
download_range_mb = (endpos - startpos) / 1024 / 1024;
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
{
startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
}
timeline = wp->greetRequest.timeline;
if (!neon_auth_token)
@@ -1249,7 +1262,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
if (written > MAXCONNINFO || written < 0)
wpg_log(FATAL, "could not append password to the safekeeper connection string");
elog(FATAL, "could not append password to the safekeeper connection string");
}
#if PG_MAJORVERSION_NUM < 16
@@ -1266,11 +1279,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
err)));
return false;
}
wpg_log(LOG,
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
sk->host, sk->port, (uint32) (startpos >> 32),
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
elog(LOG,
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
sk->host, sk->port, (uint32) (startpos >> 32),
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
options.logical = false;
options.startpoint = startpos;
@@ -1468,11 +1481,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
{
char log_prefix[64];
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
if (sk->xlogreader == NULL)
wpg_log(FATAL, "failed to allocate xlog reader");
elog(FATAL, "Failed to allocate xlog reader");
}
static NeonWALReadResult
@@ -1536,7 +1549,7 @@ static void
walprop_pg_init_event_set(WalProposer *wp)
{
if (waitEvents)
wpg_log(FATAL, "double-initialization of event set");
elog(FATAL, "double-initialization of event set");
/* for each sk, we have socket plus potentially socket for neon walreader */
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
@@ -1568,7 +1581,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events)
Assert(sk->nwrEventPos == -1);
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
}
static void
@@ -1667,8 +1680,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
{
WalProposer *wp = to_remove->wp;
wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
to_remove->host, to_remove->port, is_sk);
elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
to_remove->host, to_remove->port, is_sk);
/*
* Shortpath for exiting if have nothing to do. We never call this
@@ -1822,13 +1835,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->last_received_lsn),
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
rf->replytime);
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->last_received_lsn),
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
rf->replytime);
}
/*
@@ -1974,7 +1987,7 @@ GetLogRepRestartLSN(WalProposer *wp)
{
uint64 download_range_mb;
wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
/*
* If we need to download more than a max_slot_wal_keep_size,
@@ -1986,8 +1999,8 @@ GetLogRepRestartLSN(WalProposer *wp)
download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
{
wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
return InvalidXLogRecPtr;
}

392
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -158,28 +158,6 @@ files = [
attrs = ">=16.0.0"
pluggy = ">=0.4.0"
[[package]]
name = "anyio"
version = "4.2.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
{file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
]
[package.dependencies]
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
idna = ">=2.8"
sniffio = ">=1.1"
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
[package.extras]
doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
trio = ["trio (>=0.23)"]
[[package]]
name = "async-timeout"
version = "4.0.3"
@@ -311,20 +289,69 @@ files = [
]
[[package]]
name = "boto3"
version = "1.34.11"
description = "The AWS SDK for Python"
name = "black"
version = "23.3.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">= 3.8"
python-versions = ">=3.7"
files = [
{file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
{file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
{file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
{file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
{file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
{file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
{file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
{file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
{file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
{file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
{file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
{file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
{file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
{file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
{file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
{file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
{file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
{file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
{file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
{file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
{file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
{file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
{file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
{file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
{file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
{file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
{file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
]
[package.dependencies]
botocore = ">=1.34.11,<1.35.0"
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "boto3"
version = "1.26.16"
description = "The AWS SDK for Python"
optional = false
python-versions = ">= 3.7"
files = [
{file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
{file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
]
[package.dependencies]
botocore = ">=1.29.16,<1.30.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.10.0,<0.11.0"
s3transfer = ">=0.6.0,<0.7.0"
[package.extras]
crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
@@ -675,25 +702,22 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]
[[package]]
name = "botocore"
version = "1.34.11"
version = "1.29.16"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">= 3.8"
python-versions = ">= 3.7"
files = [
{file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
{file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
{file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
{file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
]
[package.dependencies]
jmespath = ">=0.7.1,<2.0.0"
python-dateutil = ">=2.1,<3.0.0"
urllib3 = [
{version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
{version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
]
urllib3 = ">=1.25.4,<1.27"
[package.extras]
crt = ["awscrt (==0.19.19)"]
crt = ["awscrt (==0.14.0)"]
[[package]]
name = "botocore-stubs"
@@ -1086,100 +1110,6 @@ files = [
{file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
]
[[package]]
name = "h11"
version = "0.14.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
optional = false
python-versions = ">=3.7"
files = [
{file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
]
[[package]]
name = "h2"
version = "4.1.0"
description = "HTTP/2 State-Machine based protocol implementation"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
{file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
]
[package.dependencies]
hpack = ">=4.0,<5"
hyperframe = ">=6.0,<7"
[[package]]
name = "hpack"
version = "4.0.0"
description = "Pure-Python HPACK header compression"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
{file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
]
[[package]]
name = "httpcore"
version = "1.0.2"
description = "A minimal low-level HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpcore-1.0.2-py3-none-any.whl", hash = "sha256:096cc05bca73b8e459a1fc3dcf585148f63e534eae4339559c9b8a8d6399acc7"},
{file = "httpcore-1.0.2.tar.gz", hash = "sha256:9fc092e4799b26174648e54b74ed5f683132a464e95643b226e00c2ed2fa6535"},
]
[package.dependencies]
certifi = "*"
h11 = ">=0.13,<0.15"
[package.extras]
asyncio = ["anyio (>=4.0,<5.0)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
trio = ["trio (>=0.22.0,<0.23.0)"]
[[package]]
name = "httpx"
version = "0.26.0"
description = "The next generation HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
{file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
]
[package.dependencies]
anyio = "*"
certifi = "*"
h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
httpcore = "==1.*"
idna = "*"
sniffio = "*"
[package.extras]
brotli = ["brotli", "brotlicffi"]
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
[[package]]
name = "hyperframe"
version = "6.0.1"
description = "HTTP/2 framing layer for Python"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
{file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
]
[[package]]
name = "idna"
version = "3.3"
@@ -1234,13 +1164,13 @@ files = [
[[package]]
name = "jinja2"
version = "3.1.3"
version = "3.1.2"
description = "A very fast and expressive template engine."
optional = false
python-versions = ">=3.7"
files = [
{file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
{file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
{file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"},
{file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"},
]
[package.dependencies]
@@ -1694,6 +1624,17 @@ files = [
{file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
]
[[package]]
name = "pathspec"
version = "0.9.0"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
files = [
{file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"},
{file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"},
]
[[package]]
name = "pbr"
version = "5.9.0"
@@ -1705,6 +1646,21 @@ files = [
{file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"},
]
[[package]]
name = "platformdirs"
version = "2.5.2"
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
optional = false
python-versions = ">=3.7"
files = [
{file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"},
{file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"},
]
[package.extras]
docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"]
test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"]
[[package]]
name = "pluggy"
version = "1.0.0"
@@ -1933,13 +1889,13 @@ files = [
[[package]]
name = "pytest"
version = "7.4.4"
version = "7.3.1"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
{file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
{file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
{file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
]
[package.dependencies]
@@ -1951,7 +1907,7 @@ pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
[[package]]
name = "pytest-asyncio"
@@ -2248,46 +2204,46 @@ pyasn1 = ">=0.1.3"
[[package]]
name = "ruff"
version = "0.1.11"
description = "An extremely fast Python linter and code formatter, written in Rust."
version = "0.0.269"
description = "An extremely fast Python linter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
{file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
{file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
{file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
{file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
{file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"},
{file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"},
{file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"},
{file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"},
{file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"},
{file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"},
]
[[package]]
name = "s3transfer"
version = "0.10.0"
version = "0.6.0"
description = "An Amazon S3 Transfer Manager"
optional = false
python-versions = ">= 3.8"
python-versions = ">= 3.7"
files = [
{file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
{file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
{file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
{file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
]
[package.dependencies]
botocore = ">=1.33.2,<2.0a.0"
botocore = ">=1.12.36,<2.0a.0"
[package.extras]
crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
[[package]]
name = "sarif-om"
@@ -2331,17 +2287,6 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
[[package]]
name = "sniffio"
version = "1.3.0"
description = "Sniff out which async library your code is running under"
optional = false
python-versions = ">=3.7"
files = [
{file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
]
[[package]]
name = "sshpubkeys"
version = "3.3.1"
@@ -2505,87 +2450,6 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
optional = ["python-socks", "wsaccel"]
test = ["websockets"]
[[package]]
name = "websockets"
version = "12.0"
description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
optional = false
python-versions = ">=3.8"
files = [
{file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"},
{file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"},
{file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"},
{file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"},
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"},
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"},
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"},
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"},
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"},
{file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"},
{file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"},
{file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"},
{file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"},
{file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"},
{file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"},
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"},
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"},
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"},
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"},
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"},
{file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"},
{file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"},
{file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"},
{file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"},
{file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"},
{file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"},
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"},
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"},
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"},
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"},
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"},
{file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"},
{file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"},
{file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"},
{file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"},
{file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"},
{file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"},
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"},
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"},
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"},
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"},
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"},
{file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"},
{file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"},
{file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"},
{file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"},
{file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"},
{file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"},
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"},
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"},
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"},
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"},
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"},
{file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"},
{file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"},
{file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"},
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"},
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"},
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"},
{file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"},
{file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"},
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"},
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"},
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"},
{file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"},
{file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"},
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"},
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"},
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"},
{file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"},
{file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"},
{file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"},
]
[[package]]
name = "werkzeug"
version = "3.0.1"
@@ -2629,6 +2493,16 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2866,4 +2740,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "f750bd06f1937f0614204e0ffe9a293eb61a0d7d675a80d5849f40a22745b5f9"
content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"

View File

@@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
return cmd
def ruff_check(fix_inplace: bool) -> str:
cmd = "poetry run ruff check"
if fix_inplace:
cmd += " --fix"
def black(fix_inplace: bool) -> str:
cmd = "poetry run black"
if not fix_inplace:
cmd += " --diff --check"
return cmd
def ruff_format(fix_inplace: bool) -> str:
cmd = "poetry run ruff format"
if not fix_inplace:
cmd += " --diff --check"
def ruff(fix_inplace: bool) -> str:
cmd = "poetry run ruff"
if fix_inplace:
cmd += " --fix"
return cmd
@@ -109,16 +109,16 @@ if __name__ == "__main__":
no_color=args.no_color,
)
check(
name="ruff check",
name="black",
suffix=".py",
cmd=ruff_check(fix_inplace=args.fix_inplace),
cmd=black(fix_inplace=args.fix_inplace),
changed_files=files,
no_color=args.no_color,
)
check(
name="ruff format",
name="ruff",
suffix=".py",
cmd=ruff_format(fix_inplace=args.fix_inplace),
cmd=ruff(fix_inplace=args.fix_inplace),
changed_files=files,
no_color=args.no_color,
)

View File

@@ -5,7 +5,7 @@ edition.workspace = true
license.workspace = true
[features]
default = ["testing"]
default = []
testing = []
[dependencies]
@@ -14,7 +14,6 @@ async-trait.workspace = true
base64.workspace = true
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
chrono.workspace = true
clap.workspace = true
consumption_metrics.workspace = true
@@ -27,6 +26,7 @@ hex.workspace = true
hmac.workspace = true
hostname.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
ipnet.workspace = true
itertools.workspace = true
@@ -35,8 +35,6 @@ metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -44,7 +42,6 @@ pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
reqwest = { workspace = true, features = ["json"] }
reqwest-middleware.workspace = true
reqwest-retry.workspace = true
@@ -65,13 +62,11 @@ tls-listener.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio-tungstenite.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
tungstenite.workspace = true
url.workspace = true
utils.workspace = true
uuid.workspace = true
@@ -80,13 +75,11 @@ x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres-protocol.workspace = true
redis.workspace = true
smol_str.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -4,7 +4,7 @@ pub mod backend;
pub use backend::BackendType;
mod credentials;
pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint};
pub use credentials::{check_peer_addr_is_in_list, ClientCredentials};
mod password_hack;
pub use password_hack::parse_endpoint_param;

View File

@@ -8,27 +8,26 @@ use tokio_postgres::config::AuthKeys;
use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::cache::Cached;
use crate::console::errors::GetAuthInfoError;
use crate::console::AuthSecret;
use crate::context::RequestMonitoring;
use crate::proxy::connect_compute::handle_try_wake;
use crate::proxy::retry::retry_after;
use crate::proxy::NeonOptions;
use crate::scram;
use crate::stream::Stream;
use crate::{
auth::{self, ComputeUserInfoMaybeEndpoint},
auth::{self, ClientCredentials},
config::AuthenticationConfig,
console::{
self,
provider::{CachedAllowedIps, CachedNodeInfo},
provider::{CachedNodeInfo, ConsoleReqExtra},
Api,
},
metrics::LatencyTimer,
stream, url,
};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::net::IpAddr;
use std::ops::ControlFlow;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
@@ -39,7 +38,7 @@ use tracing::{error, info, warn};
/// * When `T` is `()`, it's just a regular auth backend selector
/// which we use in [`crate::config::ProxyConfig`].
///
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
/// * However, when we substitute `T` with [`ClientCredentials`],
/// this helps us provide the credentials only to those auth
/// backends which require them for the authentication process.
pub enum BackendType<'a, T> {
@@ -57,7 +56,7 @@ pub enum BackendType<'a, T> {
pub trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError>;
fn get_allowed_ips(&self) -> Result<Arc<Vec<String>>, console::errors::GetAuthInfoError>;
}
impl std::fmt::Display for BackendType<'_, ()> {
@@ -128,23 +127,15 @@ pub struct ComputeCredentials<T> {
pub keys: T,
}
#[derive(Debug, Clone)]
pub struct ComputeUserInfoNoEndpoint {
pub user: SmolStr,
pub options: NeonOptions,
pub peer_addr: IpAddr,
pub cache_key: SmolStr,
}
#[derive(Debug, Clone)]
pub struct ComputeUserInfo {
pub endpoint: SmolStr,
pub user: SmolStr,
pub options: NeonOptions,
}
impl ComputeUserInfo {
pub fn endpoint_cache_key(&self) -> SmolStr {
self.options.get_cache_key(&self.endpoint)
}
pub inner: ComputeUserInfoNoEndpoint,
}
pub enum ComputeCredentialKeys {
@@ -153,21 +144,19 @@ pub enum ComputeCredentialKeys {
AuthKeys(AuthKeys),
}
impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
impl TryFrom<ClientCredentials> for ComputeUserInfo {
// user name
type Error = ComputeUserInfoNoEndpoint;
fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
match user_info.project {
None => Err(ComputeUserInfoNoEndpoint {
user: user_info.user,
options: user_info.options,
}),
Some(endpoint) => Ok(ComputeUserInfo {
endpoint,
user: user_info.user,
options: user_info.options,
}),
fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
let inner = ComputeUserInfoNoEndpoint {
user: creds.user,
peer_addr: creds.peer_addr,
cache_key: creds.cache_key,
};
match creds.project {
None => Err(inner),
Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
}
}
}
@@ -177,53 +166,49 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
///
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
ctx: &mut RequestMonitoring,
api: &impl console::Api,
user_info: ComputeUserInfoMaybeEndpoint,
extra: &ConsoleReqExtra,
creds: ClientCredentials,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
// We now expect to see a very specific payload in the place of password.
let (info, unauthenticated_password) = match user_info.try_into() {
let (info, unauthenticated_password) = match creds.try_into() {
Err(info) => {
let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
.await?;
ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
(res.info, Some(res.keys))
}
Ok(info) => (info, None),
};
info!("fetching user's authentication info");
let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
let allowed_ips = api.get_allowed_ips(extra, &info).await?;
// check allowed list
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed());
}
let maybe_secret = api.get_role_secret(ctx, &info).await?;
let cached_secret = api.get_role_secret(extra, &info).await?;
let cached_secret = maybe_secret.unwrap_or_else(|| {
let secret = cached_secret.clone().unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock(
&info.user,
rand::random(),
)))
AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
});
match authenticate_with_secret(
ctx,
cached_secret.value.clone(),
secret,
info,
client,
unauthenticated_password,
allow_cleartext,
config,
latency_timer,
)
.await
{
@@ -239,13 +224,13 @@ async fn auth_quirks(
}
async fn authenticate_with_secret(
ctx: &mut RequestMonitoring,
secret: AuthSecret,
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
unauthenticated_password: Option<Vec<u8>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
if let Some(password) = unauthenticated_password {
let auth_outcome = validate_password_and_exchange(&password, secret)?;
@@ -253,7 +238,7 @@ async fn authenticate_with_secret(
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user));
return Err(auth::AuthError::auth_failed(&*info.inner.user));
}
};
@@ -268,29 +253,38 @@ async fn authenticate_with_secret(
// Perform cleartext auth if we're allowed to do that.
// Currently, we use it for websocket connections (latency).
if allow_cleartext {
return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await;
return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
}
// Finally, proceed with the main auth flow (SCRAM-based).
classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
classic::authenticate(info, client, config, latency_timer, secret).await
}
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
/// only if authentication was successfuly.
async fn auth_and_wake_compute(
ctx: &mut RequestMonitoring,
api: &impl console::Api,
user_info: ComputeUserInfoMaybeEndpoint,
extra: &ConsoleReqExtra,
creds: ClientCredentials,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
let compute_credentials =
auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
let compute_credentials = auth_quirks(
api,
extra,
creds,
client,
allow_cleartext,
config,
latency_timer,
)
.await?;
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -307,8 +301,6 @@ async fn auth_and_wake_compute(
tokio::time::sleep(wait_duration).await;
};
ctx.set_project(node.aux.clone());
match compute_credentials.keys {
#[cfg(feature = "testing")]
ComputeCredentialKeys::Password(password) => node.config.password(password),
@@ -318,15 +310,15 @@ async fn auth_and_wake_compute(
Ok((node, compute_credentials.info))
}
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
impl<'a> BackendType<'a, ClientCredentials> {
/// Get compute endpoint name from the credentials.
pub fn get_endpoint(&self) -> Option<SmolStr> {
use BackendType::*;
match self {
Console(_, user_info) => user_info.project.clone(),
Console(_, creds) => creds.project.clone(),
#[cfg(feature = "testing")]
Postgres(_, user_info) => user_info.project.clone(),
Postgres(_, creds) => creds.project.clone(),
Link(_) => Some("link".into()),
#[cfg(test)]
Test(_) => Some("test".into()),
@@ -338,9 +330,9 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
use BackendType::*;
match self {
Console(_, user_info) => &user_info.user,
Console(_, creds) => &creds.user,
#[cfg(feature = "testing")]
Postgres(_, user_info) => &user_info.user,
Postgres(_, creds) => &creds.user,
Link(_) => "link",
#[cfg(test)]
Test(_) => "test",
@@ -351,37 +343,52 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
use BackendType::*;
let res = match self {
Console(api, user_info) => {
Console(api, creds) => {
info!(
user = &*user_info.user,
project = user_info.project(),
user = &*creds.user,
project = creds.project(),
"performing authentication using the console"
);
let (cache_info, user_info) =
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
.await?;
let (cache_info, user_info) = auth_and_wake_compute(
&*api,
extra,
creds,
client,
allow_cleartext,
config,
latency_timer,
)
.await?;
(cache_info, BackendType::Console(api, user_info))
}
#[cfg(feature = "testing")]
Postgres(api, user_info) => {
Postgres(api, creds) => {
info!(
user = &*user_info.user,
project = user_info.project(),
user = &*creds.user,
project = creds.project(),
"performing authentication using a local postgres instance"
);
let (cache_info, user_info) =
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
.await?;
let (cache_info, user_info) = auth_and_wake_compute(
&*api,
extra,
creds,
client,
allow_cleartext,
config,
latency_timer,
)
.await?;
(cache_info, BackendType::Postgres(api, user_info))
}
// NOTE: this auth backend doesn't use client credentials.
@@ -409,16 +416,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
impl BackendType<'_, ComputeUserInfo> {
pub async fn get_allowed_ips(
&self,
ctx: &mut RequestMonitoring,
) -> Result<CachedAllowedIps, GetAuthInfoError> {
extra: &ConsoleReqExtra,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
Console(api, creds) => api.get_allowed_ips(extra, creds).await,
#[cfg(feature = "testing")]
Postgres(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
Link(_) => Ok(Arc::new(vec![])),
#[cfg(test)]
Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))),
Test(x) => x.get_allowed_ips(),
}
}
@@ -426,14 +433,14 @@ impl BackendType<'_, ComputeUserInfo> {
/// The link auth flow doesn't support this, so we return [`None`] in that case.
pub async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
use BackendType::*;
match self {
Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
#[cfg(feature = "testing")]
Postgres(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Link(_) => Ok(None),
#[cfg(test)]
Test(x) => x.wake_compute().map(Some),

View File

@@ -54,7 +54,7 @@ pub(super) async fn authenticate(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*creds.user));
return Err(auth::AuthError::auth_failed(&*creds.inner.user));
}
};

View File

@@ -36,7 +36,7 @@ pub async fn authenticate_cleartext(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user));
return Err(auth::AuthError::auth_failed(&*info.inner.user));
}
};
@@ -67,8 +67,7 @@ pub async fn password_hack_no_authentication(
// Report tentative success; compute node will check the password anyway.
Ok(ComputeCredentials {
info: ComputeUserInfo {
user: info.user,
options: info.options,
inner: info,
endpoint: payload.endpoint,
},
keys: payload.password,

View File

@@ -1,8 +1,8 @@
//! User credentials used in authentication.
use crate::{
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
auth::password_hack::parse_endpoint_param, error::UserFacingError,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
@@ -12,7 +12,7 @@ use thiserror::Error;
use tracing::{info, warn};
#[derive(Debug, Error, PartialEq, Eq, Clone)]
pub enum ComputeUserInfoParseError {
pub enum ClientCredsParseError {
#[error("Parameter '{0}' is missing in startup packet.")]
MissingKey(&'static str),
@@ -33,58 +33,39 @@ pub enum ComputeUserInfoParseError {
MalformedProjectName(SmolStr),
}
impl UserFacingError for ComputeUserInfoParseError {}
impl UserFacingError for ClientCredsParseError {}
/// Various client credentials which we use for authentication.
/// Note that we don't store any kind of client key or password here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ComputeUserInfoMaybeEndpoint {
pub struct ClientCredentials {
pub user: SmolStr,
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<SmolStr>,
pub options: NeonOptions,
pub cache_key: SmolStr,
pub peer_addr: IpAddr,
}
impl ComputeUserInfoMaybeEndpoint {
impl ClientCredentials {
#[inline]
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
pub fn endpoint_sni<'a>(
sni: &'a str,
common_names: &HashSet<String>,
) -> Result<&'a str, ComputeUserInfoParseError> {
let Some((subdomain, common_name)) = sni.split_once('.') else {
return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
};
if !common_names.contains(common_name) {
return Err(ComputeUserInfoParseError::UnknownCommonName {
cn: common_name.into(),
});
}
Ok(subdomain)
}
impl ComputeUserInfoMaybeEndpoint {
impl ClientCredentials {
pub fn parse(
ctx: &mut RequestMonitoring,
params: &StartupMessageParams,
sni: Option<&str>,
common_names: Option<&HashSet<String>>,
) -> Result<Self, ComputeUserInfoParseError> {
use ComputeUserInfoParseError::*;
common_names: Option<HashSet<String>>,
peer_addr: IpAddr,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user: SmolStr = get_param("user")?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
ctx.set_endpoint_id(sni.map(SmolStr::from));
let user = get_param("user")?.into();
// Project name might be passed via PG's command-line options.
let project_option = params
@@ -102,7 +83,21 @@ impl ComputeUserInfoMaybeEndpoint {
let project_from_domain = if let Some(sni_str) = sni {
if let Some(cn) = common_names {
Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
let project = common_name_from_sni
.and_then(|domain| {
if cn.contains(domain) {
subdomain_from_sni(sni_str, domain)
} else {
None
}
})
.ok_or_else(|| UnknownCommonName {
cn: common_name_from_sni.unwrap_or("").into(),
})?;
Some(project)
} else {
None
}
@@ -141,17 +136,23 @@ impl ComputeUserInfoMaybeEndpoint {
info!("Connection with password hack");
}
let options = NeonOptions::parse_params(params);
let cache_key = format!(
"{}{}",
project.as_deref().unwrap_or(""),
neon_options_str(params)
)
.into();
Ok(Self {
user,
project,
options,
cache_key,
peer_addr,
})
}
}
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<SmolStr>) -> bool {
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<String>) -> bool {
if ip_list.is_empty() {
return true;
}
@@ -203,19 +204,25 @@ fn project_name_valid(name: &str) -> bool {
name.chars().all(|c| c.is_alphanumeric() || c == '-')
}
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
sni.strip_suffix(common_name)?
.strip_suffix('.')
.map(SmolStr::from)
}
#[cfg(test)]
mod tests {
use super::*;
use ComputeUserInfoParseError::*;
use ClientCredsParseError::*;
#[test]
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.project, None);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
Ok(())
}
@@ -227,10 +234,10 @@ mod tests {
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.project, None);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
Ok(())
}
@@ -242,12 +249,11 @@ mod tests {
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.project.as_deref(), Some("foo"));
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
assert_eq!(creds.cache_key, "foo");
Ok(())
}
@@ -259,10 +265,10 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.project.as_deref(), Some("bar"));
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
Ok(())
}
@@ -274,10 +280,10 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.project.as_deref(), Some("bar"));
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
Ok(())
}
@@ -292,10 +298,10 @@ mod tests {
),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.project.is_none());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
Ok(())
}
@@ -307,10 +313,10 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.project.is_none());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
Ok(())
}
@@ -322,11 +328,10 @@ mod tests {
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.project.as_deref(), Some("baz"));
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
Ok(())
}
@@ -337,17 +342,15 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.project.as_deref(), Some("p1"));
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.project.as_deref(), Some("p1"));
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
Ok(())
}
@@ -360,10 +363,9 @@ mod tests {
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
.expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -380,10 +382,9 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let mut ctx = RequestMonitoring::test();
let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
.expect_err("should fail");
match err {
UnknownCommonName { cn } => {
assert_eq!(cn, "localhost");
@@ -401,14 +402,10 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.project.as_deref(), Some("project"));
assert_eq!(
user_info.options.get_cache_key("project"),
"project endpoint_type:read_write lsn:0/2"
);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("project"));
assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
Ok(())
}

View File

@@ -8,7 +8,6 @@ use std::{net::SocketAddr, sync::Arc};
use futures::future::Either;
use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring;
use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
@@ -171,16 +170,7 @@ async fn task_main(
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
let mut ctx =
RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(
&mut ctx,
dest_suffix,
tls_config,
tls_server_end_point,
socket,
)
.await
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
@@ -246,7 +236,6 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
async fn handle_client(
ctx: &mut RequestMonitoring,
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
@@ -272,5 +261,5 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default();
proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
}

View File

@@ -3,14 +3,14 @@ use proxy::auth;
use proxy::config::AuthenticationConfig;
use proxy::config::CacheOptions;
use proxy::config::HttpConfig;
use proxy::config::ProjectInfoCacheOptions;
use proxy::console;
use proxy::context::parquet::ParquetUploadArgs;
use proxy::console::provider::AllowedIpsCache;
use proxy::console::provider::NodeInfoCache;
use proxy::console::provider::RoleSecretCache;
use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::redis::notifications;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
@@ -44,9 +44,6 @@ enum AuthBackend {
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct ProxyCliArgs {
/// Name of the region this proxy is deployed in
#[clap(long, default_value_t = String::new())]
region: String,
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
@@ -136,15 +133,6 @@ struct ProxyCliArgs {
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_ip_check_for_http: bool,
/// redis url for notifications.
#[clap(long)]
redis_notifications: Option<String>,
/// cache for `project_info` (use `size=0` to disable)
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
project_info_cache: String,
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
}
#[derive(clap::Args, Clone, Copy, Debug)]
@@ -233,11 +221,6 @@ async fn main() -> anyhow::Result<()> {
));
}
client_tasks.spawn(proxy::context::parquet::worker(
cancellation_token.clone(),
args.parquet_upload,
));
// maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
@@ -248,15 +231,6 @@ async fn main() -> anyhow::Result<()> {
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
}
if let auth::BackendType::Console(api, _) = &config.auth_backend {
let cache = api.caches.project_info.clone();
if let Some(url) = args.redis_notifications {
info!("Starting redis notifications listener ({url})");
maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
let maintenance = loop {
// get one complete task
match futures::future::select(
@@ -322,17 +296,32 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let auth_backend = match &args.auth_backend {
AuthBackend::Console => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?;
let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!(
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
);
let caches = Box::leak(Box::new(console::caches::ApiCaches::new(
wake_compute_cache_config,
project_info_cache_config,
)));
info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}");
info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}");
let caches = Box::leak(Box::new(console::caches::ApiCaches {
node_info: NodeInfoCache::new(
"node_info_cache",
wake_compute_cache_config.size,
wake_compute_cache_config.ttl,
true,
),
allowed_ips: AllowedIpsCache::new(
"allowed_ips_cache",
allowed_ips_cache_config.size,
allowed_ips_cache_config.ttl,
false,
),
role_secret: RoleSecretCache::new(
"role_secret_cache",
role_secret_cache_config.size,
role_secret_cache_config.ttl,
false,
),
}));
let config::WakeComputeLockOptions {
shards,
@@ -391,8 +380,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
endpoint_rps_limit,
// TODO: add this argument
region: args.region.clone(),
}));
Ok(config)

View File

@@ -1,6 +1,311 @@
pub mod common;
pub mod project_info;
mod timed_lru;
use std::{
borrow::Borrow,
hash::Hash,
ops::{Deref, DerefMut},
time::{Duration, Instant},
};
use tracing::debug;
// This seems to make more sense than `lru` or `cached`:
//
// * `near/nearcore` ditched `cached` in favor of `lru`
// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed).
//
// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs).
// This severely hinders its usage both in terms of creating wrappers and supported key types.
//
// On the other hand, `hashlink` has good download stats and appears to be maintained.
use hashlink::{linked_hash_map::RawEntryMut, LruCache};
/// A generic trait which exposes types of cache's key and value,
/// as well as the notion of cache entry invalidation.
/// This is useful for [`timed_lru::Cached`].
pub trait Cache {
/// Entry's key.
type Key;
/// Entry's value.
type Value;
/// Used for entry invalidation.
type LookupInfo<Key>;
/// Invalidate an entry using a lookup info.
/// We don't have an empty default impl because it's error-prone.
fn invalidate(&self, _: &Self::LookupInfo<Self::Key>);
}
impl<C: Cache> Cache for &C {
type Key = C::Key;
type Value = C::Value;
type LookupInfo<Key> = C::LookupInfo<Key>;
fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
C::invalidate(self, info)
}
}
pub use common::{Cache, Cached};
pub use timed_lru::TimedLru;
pub mod timed_lru {
use super::*;
/// An implementation of timed LRU cache with fixed capacity.
/// Key properties:
///
/// * Whenever a new entry is inserted, the least recently accessed one is evicted.
/// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
///
/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp.
/// If the entry has expired, we remove it from the cache; Otherwise we bump the
/// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong
/// its existence.
///
/// * There's an API for immediate invalidation (removal) of a cache entry;
/// It's useful in case we know for sure that the entry is no longer correct.
/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
///
/// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
/// or by a successful lookup (i.e. the entry hasn't expired yet).
/// There is no background job to reap the expired records.
///
/// * It's possible for an entry that has not yet expired entry to be evicted
/// before expired items. That's a bit wasteful, but probably fine in practice.
pub struct TimedLru<K, V> {
/// Cache's name for tracing.
name: &'static str,
/// The underlying cache implementation.
cache: parking_lot::Mutex<LruCache<K, Entry<V>>>,
/// Default time-to-live of a single entry.
ttl: Duration,
update_ttl_on_retrieval: bool,
}
impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
type Key = K;
type Value = V;
type LookupInfo<Key> = LookupInfo<Key>;
fn invalidate(&self, info: &Self::LookupInfo<K>) {
self.invalidate_raw(info)
}
}
struct Entry<T> {
created_at: Instant,
expires_at: Instant,
value: T,
}
impl<K: Hash + Eq, V> TimedLru<K, V> {
/// Construct a new LRU cache with timed entries.
pub fn new(
name: &'static str,
capacity: usize,
ttl: Duration,
update_ttl_on_retrieval: bool,
) -> Self {
Self {
name,
cache: LruCache::new(capacity).into(),
ttl,
update_ttl_on_retrieval,
}
}
/// Drop an entry from the cache if it's outdated.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn invalidate_raw(&self, info: &LookupInfo<K>) {
let now = Instant::now();
// Do costly things before taking the lock.
let mut cache = self.cache.lock();
let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
RawEntryMut::Vacant(_) => return,
RawEntryMut::Occupied(x) => x,
};
// Remove the entry if it was created prior to lookup timestamp.
let entry = raw_entry.get();
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
let should_remove = created_at <= info.created_at || expires_at <= now;
if should_remove {
raw_entry.remove();
}
drop(cache); // drop lock before logging
debug!(
created_at = format_args!("{created_at:?}"),
expires_at = format_args!("{expires_at:?}"),
entry_removed = should_remove,
"processed a cache entry invalidation event"
);
}
/// Try retrieving an entry by its key, then execute `extract` if it exists.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn get_raw<Q, R>(&self, key: &Q, extract: impl FnOnce(&K, &Entry<V>) -> R) -> Option<R>
where
K: Borrow<Q>,
Q: Hash + Eq + ?Sized,
{
let now = Instant::now();
let deadline = now.checked_add(self.ttl).expect("time overflow");
// Do costly things before taking the lock.
let mut cache = self.cache.lock();
let mut raw_entry = match cache.raw_entry_mut().from_key(key) {
RawEntryMut::Vacant(_) => return None,
RawEntryMut::Occupied(x) => x,
};
// Immeditely drop the entry if it has expired.
let entry = raw_entry.get();
if entry.expires_at <= now {
raw_entry.remove();
return None;
}
let value = extract(raw_entry.key(), entry);
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
// Update the deadline and the entry's position in the LRU list.
if self.update_ttl_on_retrieval {
raw_entry.get_mut().expires_at = deadline;
}
raw_entry.to_back();
drop(cache); // drop lock before logging
debug!(
created_at = format_args!("{created_at:?}"),
old_expires_at = format_args!("{expires_at:?}"),
new_expires_at = format_args!("{deadline:?}"),
"accessed a cache entry"
);
Some(value)
}
/// Insert an entry to the cache. If an entry with the same key already
/// existed, return the previous value and its creation timestamp.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn insert_raw(&self, key: K, value: V) -> (Instant, Option<V>) {
let created_at = Instant::now();
let expires_at = created_at.checked_add(self.ttl).expect("time overflow");
let entry = Entry {
created_at,
expires_at,
value,
};
// Do costly things before taking the lock.
let old = self
.cache
.lock()
.insert(key, entry)
.map(|entry| entry.value);
debug!(
created_at = format_args!("{created_at:?}"),
expires_at = format_args!("{expires_at:?}"),
replaced = old.is_some(),
"created a cache entry"
);
(created_at, old)
}
}
impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
let (created_at, old) = self.insert_raw(key.clone(), value.clone());
let cached = Cached {
token: Some((self, LookupInfo { created_at, key })),
value,
};
(old, cached)
}
}
impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
/// Retrieve a cached entry in convenient wrapper.
pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
where
K: Borrow<Q> + Clone,
Q: Hash + Eq + ?Sized,
{
self.get_raw(key, |key, entry| {
let info = LookupInfo {
created_at: entry.created_at,
key: key.clone(),
};
Cached {
token: Some((self, info)),
value: entry.value.clone(),
}
})
}
}
/// Lookup information for key invalidation.
pub struct LookupInfo<K> {
/// Time of creation of a cache [`Entry`].
/// We use this during invalidation lookups to prevent eviction of a newer
/// entry sharing the same key (it might've been inserted by a different
/// task after we got the entry we're trying to invalidate now).
created_at: Instant,
/// Search by this key.
key: K,
}
/// Wrapper for convenient entry invalidation.
pub struct Cached<C: Cache> {
/// Cache + lookup info.
token: Option<(C, C::LookupInfo<C::Key>)>,
/// The value itself.
value: C::Value,
}
impl<C: Cache> Cached<C> {
/// Place any entry into this wrapper; invalidation will be a no-op.
pub fn new_uncached(value: C::Value) -> Self {
Self { token: None, value }
}
/// Drop this entry from a cache if it's still there.
pub fn invalidate(self) -> C::Value {
if let Some((cache, info)) = &self.token {
cache.invalidate(info);
}
self.value
}
/// Tell if this entry is actually cached.
pub fn cached(&self) -> bool {
self.token.is_some()
}
}
impl<C: Cache> Deref for Cached<C> {
type Target = C::Value;
fn deref(&self) -> &Self::Target {
&self.value
}
}
impl<C: Cache> DerefMut for Cached<C> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.value
}
}
}

View File

@@ -1,72 +0,0 @@
use std::ops::{Deref, DerefMut};
/// A generic trait which exposes types of cache's key and value,
/// as well as the notion of cache entry invalidation.
/// This is useful for [`Cached`].
pub trait Cache {
/// Entry's key.
type Key;
/// Entry's value.
type Value;
/// Used for entry invalidation.
type LookupInfo<Key>;
/// Invalidate an entry using a lookup info.
/// We don't have an empty default impl because it's error-prone.
fn invalidate(&self, _: &Self::LookupInfo<Self::Key>);
}
impl<C: Cache> Cache for &C {
type Key = C::Key;
type Value = C::Value;
type LookupInfo<Key> = C::LookupInfo<Key>;
fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
C::invalidate(self, info)
}
}
/// Wrapper for convenient entry invalidation.
pub struct Cached<C: Cache, V = <C as Cache>::Value> {
/// Cache + lookup info.
pub token: Option<(C, C::LookupInfo<C::Key>)>,
/// The value itself.
pub value: V,
}
impl<C: Cache, V> Cached<C, V> {
/// Place any entry into this wrapper; invalidation will be a no-op.
pub fn new_uncached(value: V) -> Self {
Self { token: None, value }
}
/// Drop this entry from a cache if it's still there.
pub fn invalidate(self) -> V {
if let Some((cache, info)) = &self.token {
cache.invalidate(info);
}
self.value
}
/// Tell if this entry is actually cached.
pub fn cached(&self) -> bool {
self.token.is_some()
}
}
impl<C: Cache, V> Deref for Cached<C, V> {
type Target = V;
fn deref(&self) -> &Self::Target {
&self.value
}
}
impl<C: Cache, V> DerefMut for Cached<C, V> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.value
}
}

View File

@@ -1,496 +0,0 @@
use std::{
collections::HashSet,
convert::Infallible,
sync::{atomic::AtomicU64, Arc},
time::Duration,
};
use dashmap::DashMap;
use rand::{thread_rng, Rng};
use smol_str::SmolStr;
use tokio::time::Instant;
use tracing::{debug, info};
use crate::{config::ProjectInfoCacheOptions, console::AuthSecret};
use super::{Cache, Cached};
pub trait ProjectInfoCache {
fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
fn enable_ttl(&self);
fn disable_ttl(&self);
}
struct Entry<T> {
created_at: Instant,
value: T,
}
impl<T> Entry<T> {
pub fn new(value: T) -> Self {
Self {
created_at: Instant::now(),
value,
}
}
}
impl<T> From<T> for Entry<T> {
fn from(value: T) -> Self {
Self::new(value)
}
}
#[derive(Default)]
struct EndpointInfo {
secret: std::collections::HashMap<SmolStr, Entry<AuthSecret>>,
allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
}
impl EndpointInfo {
fn check_ignore_cache(ignore_cache_since: Option<Instant>, created_at: Instant) -> bool {
match ignore_cache_since {
None => false,
Some(t) => t < created_at,
}
}
pub fn get_role_secret(
&self,
role_name: &SmolStr,
valid_since: Instant,
ignore_cache_since: Option<Instant>,
) -> Option<(AuthSecret, bool)> {
if let Some(secret) = self.secret.get(role_name) {
if valid_since < secret.created_at {
return Some((
secret.value.clone(),
Self::check_ignore_cache(ignore_cache_since, secret.created_at),
));
}
}
None
}
pub fn get_allowed_ips(
&self,
valid_since: Instant,
ignore_cache_since: Option<Instant>,
) -> Option<(Arc<Vec<SmolStr>>, bool)> {
if let Some(allowed_ips) = &self.allowed_ips {
if valid_since < allowed_ips.created_at {
return Some((
allowed_ips.value.clone(),
Self::check_ignore_cache(ignore_cache_since, allowed_ips.created_at),
));
}
}
None
}
pub fn invalidate_allowed_ips(&mut self) {
self.allowed_ips = None;
}
pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
self.secret.remove(role_name);
}
}
/// Cache for project info.
/// This is used to cache auth data for endpoints.
/// Invalidation is done by console notifications or by TTL (if console notifications are disabled).
///
/// We also store endpoint-to-project mapping in the cache, to be able to access per-endpoint data.
/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
pub struct ProjectInfoCacheImpl {
cache: DashMap<SmolStr, EndpointInfo>,
project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
config: ProjectInfoCacheOptions,
start_time: Instant,
ttl_disabled_since_us: AtomicU64,
}
impl ProjectInfoCache for ProjectInfoCacheImpl {
fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
info!("invalidating allowed ips for project `{}`", project_id);
let endpoints = self
.project2ep
.get(project_id)
.map(|kv| kv.value().clone())
.unwrap_or_default();
for endpoint_id in endpoints {
if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
endpoint_info.invalidate_allowed_ips();
}
}
}
fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
info!(
"invalidating role secret for project_id `{}` and role_name `{}`",
project_id, role_name
);
let endpoints = self
.project2ep
.get(project_id)
.map(|kv| kv.value().clone())
.unwrap_or_default();
for endpoint_id in endpoints {
if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
endpoint_info.invalidate_role_secret(role_name);
}
}
}
fn enable_ttl(&self) {
self.ttl_disabled_since_us
.store(u64::MAX, std::sync::atomic::Ordering::Relaxed);
}
fn disable_ttl(&self) {
let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
self.ttl_disabled_since_us
.store(new_ttl, std::sync::atomic::Ordering::Relaxed);
}
}
impl ProjectInfoCacheImpl {
pub fn new(config: ProjectInfoCacheOptions) -> Self {
Self {
cache: DashMap::new(),
project2ep: DashMap::new(),
config,
ttl_disabled_since_us: AtomicU64::new(u64::MAX),
start_time: Instant::now(),
}
}
pub fn get_role_secret(
&self,
endpoint_id: &SmolStr,
role_name: &SmolStr,
) -> Option<Cached<&Self, AuthSecret>> {
let (valid_since, ignore_cache_since) = self.get_cache_times();
let endpoint_info = self.cache.get(endpoint_id)?;
let (value, ignore_cache) =
endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?;
if !ignore_cache {
let cached = Cached {
token: Some((
self,
CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()),
)),
value,
};
return Some(cached);
}
Some(Cached::new_uncached(value))
}
pub fn get_allowed_ips(
&self,
endpoint_id: &SmolStr,
) -> Option<Cached<&Self, Arc<Vec<SmolStr>>>> {
let (valid_since, ignore_cache_since) = self.get_cache_times();
let endpoint_info = self.cache.get(endpoint_id)?;
let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
let (value, ignore_cache) = value?;
if !ignore_cache {
let cached = Cached {
token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))),
value,
};
return Some(cached);
}
Some(Cached::new_uncached(value))
}
pub fn insert_role_secret(
&self,
project_id: &SmolStr,
endpoint_id: &SmolStr,
role_name: &SmolStr,
secret: AuthSecret,
) {
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
}
self.inser_project2endpoint(project_id, endpoint_id);
let mut entry = self.cache.entry(endpoint_id.clone()).or_default();
if entry.secret.len() < self.config.max_roles {
entry.secret.insert(role_name.clone(), secret.into());
}
}
pub fn insert_allowed_ips(
&self,
project_id: &SmolStr,
endpoint_id: &SmolStr,
allowed_ips: Arc<Vec<SmolStr>>,
) {
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
}
self.inser_project2endpoint(project_id, endpoint_id);
self.cache
.entry(endpoint_id.clone())
.or_default()
.allowed_ips = Some(allowed_ips.into());
}
fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
endpoints.insert(endpoint_id.clone());
} else {
self.project2ep
.insert(project_id.clone(), HashSet::from([endpoint_id.clone()]));
}
}
fn get_cache_times(&self) -> (Instant, Option<Instant>) {
let mut valid_since = Instant::now() - self.config.ttl;
// Only ignore cache if ttl is disabled.
let ttl_disabled_since_us = self
.ttl_disabled_since_us
.load(std::sync::atomic::Ordering::Relaxed);
let ignore_cache_since = if ttl_disabled_since_us != u64::MAX {
let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us);
// We are fine if entry is not older than ttl or was added before we are getting notifications.
valid_since = valid_since.min(ignore_cache_since);
Some(ignore_cache_since)
} else {
None
};
(valid_since, ignore_cache_since)
}
pub async fn gc_worker(&self) -> anyhow::Result<Infallible> {
let mut interval =
tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32);
loop {
interval.tick().await;
if self.cache.len() <= self.config.size {
// If there are not too many entries, wait until the next gc cycle.
continue;
}
self.gc();
}
}
fn gc(&self) {
let shard = thread_rng().gen_range(0..self.project2ep.shards().len());
debug!(shard, "project_info_cache: performing epoch reclamation");
// acquire a random shard lock
let mut removed = 0;
let shard = self.project2ep.shards()[shard].write();
for (_, endpoints) in shard.iter() {
for endpoint in endpoints.get().iter() {
self.cache.remove(endpoint);
removed += 1;
}
}
// We can drop this shard only after making sure that all endpoints are removed.
drop(shard);
info!("project_info_cache: removed {removed} endpoints");
}
}
/// Lookup info for project info cache.
/// This is used to invalidate cache entries.
pub struct CachedLookupInfo {
/// Search by this key.
endpoint_id: SmolStr,
lookup_type: LookupType,
}
impl CachedLookupInfo {
pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
Self {
endpoint_id,
lookup_type: LookupType::RoleSecret(role_name),
}
}
pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
Self {
endpoint_id,
lookup_type: LookupType::AllowedIps,
}
}
}
enum LookupType {
RoleSecret(SmolStr),
AllowedIps,
}
impl Cache for ProjectInfoCacheImpl {
type Key = SmolStr;
// Value is not really used here, but we need to specify it.
type Value = SmolStr;
type LookupInfo<Key> = CachedLookupInfo;
fn invalidate(&self, key: &Self::LookupInfo<SmolStr>) {
match &key.lookup_type {
LookupType::RoleSecret(role_name) => {
if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
endpoint_info.invalidate_role_secret(role_name);
}
}
LookupType::AllowedIps => {
if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
endpoint_info.invalidate_allowed_ips();
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{console::AuthSecret, scram::ServerSecret};
use smol_str::SmolStr;
use std::{sync::Arc, time::Duration};
#[tokio::test]
async fn test_project_info_cache_settings() {
tokio::time::pause();
let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
size: 2,
max_roles: 2,
ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600),
});
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: SmolStr = "user1".into();
let user2: SmolStr = "user2".into();
let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
assert!(cached.cached());
assert_eq!(cached.value, secret1);
let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
assert!(cached.cached());
assert_eq!(cached.value, secret2);
// Shouldn't add more than 2 roles.
let user3: SmolStr = "user3".into();
let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32]));
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(cached.cached());
assert_eq!(cached.value, allowed_ips);
tokio::time::advance(Duration::from_secs(2)).await;
let cached = cache.get_role_secret(&endpoint_id, &user1);
assert!(cached.is_none());
let cached = cache.get_role_secret(&endpoint_id, &user2);
assert!(cached.is_none());
let cached = cache.get_allowed_ips(&endpoint_id);
assert!(cached.is_none());
}
#[tokio::test]
async fn test_project_info_cache_invalidations() {
tokio::time::pause();
let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
size: 2,
max_roles: 2,
ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600),
}));
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_secs(2)).await;
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: SmolStr = "user1".into();
let user2: SmolStr = "user2".into();
let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
tokio::time::advance(Duration::from_secs(2)).await;
// Nothing should be invalidated.
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
// TTL is disabled, so it should be impossible to invalidate this value.
assert!(!cached.cached());
assert_eq!(cached.value, secret1);
cached.invalidate(); // Shouldn't do anything.
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
assert_eq!(cached.value, secret1);
let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
assert!(!cached.cached());
assert_eq!(cached.value, secret2);
// The only way to invalidate this value is to invalidate via the api.
cache.invalidate_role_secret_for_project(&project_id, &user2);
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(!cached.cached());
assert_eq!(cached.value, allowed_ips);
}
#[tokio::test]
async fn test_disable_ttl_invalidate_added_before() {
tokio::time::pause();
let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
size: 2,
max_roles: 2,
ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600),
}));
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: SmolStr = "user1".into();
let user2: SmolStr = "user2".into();
let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_millis(100)).await;
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
// Added before ttl was disabled + ttl should be still cached.
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
assert!(cached.cached());
let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
assert!(cached.cached());
tokio::time::advance(Duration::from_secs(1)).await;
// Added before ttl was disabled + ttl should expire.
assert!(cache.get_role_secret(&endpoint_id, &user1).is_none());
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
// Added after ttl was disabled + ttl should not be cached.
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(!cached.cached());
tokio::time::advance(Duration::from_secs(1)).await;
// Added before ttl was disabled + ttl still should expire.
assert!(cache.get_role_secret(&endpoint_id, &user1).is_none());
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
// Shouldn't be invalidated.
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(!cached.cached());
assert_eq!(cached.value, allowed_ips);
}
}

View File

@@ -1,258 +0,0 @@
use std::{
borrow::Borrow,
hash::Hash,
time::{Duration, Instant},
};
use tracing::debug;
// This seems to make more sense than `lru` or `cached`:
//
// * `near/nearcore` ditched `cached` in favor of `lru`
// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed).
//
// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs).
// This severely hinders its usage both in terms of creating wrappers and supported key types.
//
// On the other hand, `hashlink` has good download stats and appears to be maintained.
use hashlink::{linked_hash_map::RawEntryMut, LruCache};
use super::{common::Cached, *};
/// An implementation of timed LRU cache with fixed capacity.
/// Key properties:
///
/// * Whenever a new entry is inserted, the least recently accessed one is evicted.
/// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
///
/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp.
/// If the entry has expired, we remove it from the cache; Otherwise we bump the
/// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong
/// its existence.
///
/// * There's an API for immediate invalidation (removal) of a cache entry;
/// It's useful in case we know for sure that the entry is no longer correct.
/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
///
/// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
/// or by a successful lookup (i.e. the entry hasn't expired yet).
/// There is no background job to reap the expired records.
///
/// * It's possible for an entry that has not yet expired entry to be evicted
/// before expired items. That's a bit wasteful, but probably fine in practice.
pub struct TimedLru<K, V> {
/// Cache's name for tracing.
name: &'static str,
/// The underlying cache implementation.
cache: parking_lot::Mutex<LruCache<K, Entry<V>>>,
/// Default time-to-live of a single entry.
ttl: Duration,
update_ttl_on_retrieval: bool,
}
impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
type Key = K;
type Value = V;
type LookupInfo<Key> = LookupInfo<Key>;
fn invalidate(&self, info: &Self::LookupInfo<K>) {
self.invalidate_raw(info)
}
}
struct Entry<T> {
created_at: Instant,
expires_at: Instant,
value: T,
}
impl<K: Hash + Eq, V> TimedLru<K, V> {
/// Construct a new LRU cache with timed entries.
pub fn new(
name: &'static str,
capacity: usize,
ttl: Duration,
update_ttl_on_retrieval: bool,
) -> Self {
Self {
name,
cache: LruCache::new(capacity).into(),
ttl,
update_ttl_on_retrieval,
}
}
/// Drop an entry from the cache if it's outdated.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn invalidate_raw(&self, info: &LookupInfo<K>) {
let now = Instant::now();
// Do costly things before taking the lock.
let mut cache = self.cache.lock();
let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
RawEntryMut::Vacant(_) => return,
RawEntryMut::Occupied(x) => x,
};
// Remove the entry if it was created prior to lookup timestamp.
let entry = raw_entry.get();
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
let should_remove = created_at <= info.created_at || expires_at <= now;
if should_remove {
raw_entry.remove();
}
drop(cache); // drop lock before logging
debug!(
created_at = format_args!("{created_at:?}"),
expires_at = format_args!("{expires_at:?}"),
entry_removed = should_remove,
"processed a cache entry invalidation event"
);
}
/// Try retrieving an entry by its key, then execute `extract` if it exists.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn get_raw<Q, R>(&self, key: &Q, extract: impl FnOnce(&K, &Entry<V>) -> R) -> Option<R>
where
K: Borrow<Q>,
Q: Hash + Eq + ?Sized,
{
let now = Instant::now();
let deadline = now.checked_add(self.ttl).expect("time overflow");
// Do costly things before taking the lock.
let mut cache = self.cache.lock();
let mut raw_entry = match cache.raw_entry_mut().from_key(key) {
RawEntryMut::Vacant(_) => return None,
RawEntryMut::Occupied(x) => x,
};
// Immeditely drop the entry if it has expired.
let entry = raw_entry.get();
if entry.expires_at <= now {
raw_entry.remove();
return None;
}
let value = extract(raw_entry.key(), entry);
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
// Update the deadline and the entry's position in the LRU list.
if self.update_ttl_on_retrieval {
raw_entry.get_mut().expires_at = deadline;
}
raw_entry.to_back();
drop(cache); // drop lock before logging
debug!(
created_at = format_args!("{created_at:?}"),
old_expires_at = format_args!("{expires_at:?}"),
new_expires_at = format_args!("{deadline:?}"),
"accessed a cache entry"
);
Some(value)
}
/// Insert an entry to the cache. If an entry with the same key already
/// existed, return the previous value and its creation timestamp.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn insert_raw(&self, key: K, value: V) -> (Instant, Option<V>) {
let created_at = Instant::now();
let expires_at = created_at.checked_add(self.ttl).expect("time overflow");
let entry = Entry {
created_at,
expires_at,
value,
};
// Do costly things before taking the lock.
let old = self
.cache
.lock()
.insert(key, entry)
.map(|entry| entry.value);
debug!(
created_at = format_args!("{created_at:?}"),
expires_at = format_args!("{expires_at:?}"),
replaced = old.is_some(),
"created a cache entry"
);
(created_at, old)
}
}
impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
let (created_at, old) = self.insert_raw(key.clone(), value.clone());
let cached = Cached {
token: Some((self, LookupInfo { created_at, key })),
value,
};
(old, cached)
}
}
impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
/// Retrieve a cached entry in convenient wrapper.
pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
where
K: Borrow<Q> + Clone,
Q: Hash + Eq + ?Sized,
{
self.get_raw(key, |key, entry| {
let info = LookupInfo {
created_at: entry.created_at,
key: key.clone(),
};
Cached {
token: Some((self, info)),
value: entry.value.clone(),
}
})
}
/// Retrieve a cached entry in convenient wrapper, ignoring its TTL.
pub fn get_ignoring_ttl<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
where
K: Borrow<Q>,
Q: Hash + Eq + ?Sized,
{
let mut cache = self.cache.lock();
cache
.get(key)
.map(|entry| Cached::new_uncached(entry.value.clone()))
}
/// Remove an entry from the cache.
pub fn remove<Q>(&self, key: &Q) -> Option<V>
where
K: Borrow<Q> + Clone,
Q: Hash + Eq + ?Sized,
{
let mut cache = self.cache.lock();
cache.remove(key).map(|entry| entry.value)
}
}
/// Lookup information for key invalidation.
pub struct LookupInfo<K> {
/// Time of creation of a cache [`Entry`].
/// We use this during invalidation lookups to prevent eviction of a newer
/// entry sharing the same key (it might've been inserted by a different
/// task after we got the entry we're trying to invalidate now).
created_at: Instant,
/// Search by this key.
key: K,
}

View File

@@ -1,7 +1,6 @@
use crate::{
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::neon_option,
error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
@@ -39,17 +38,7 @@ impl UserFacingError for ConnectionError {
// This helps us drop irrelevant library-specific prefixes.
// TODO: propagate severity level and other parameters.
Postgres(err) => match err.as_db_error() {
Some(err) => {
let msg = err.message();
if msg.starts_with("unsupported startup parameter: ")
|| msg.starts_with("unsupported startup parameter in options: ")
{
format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter")
} else {
msg.to_owned()
}
}
Some(err) => err.message().to_owned(),
None => err.to_string(),
},
WakeComputeError(err) => err.to_string_client(),
@@ -243,9 +232,9 @@ impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
allow_self_signed_compute: bool,
timeout: Duration,
proto: &'static str,
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
@@ -279,9 +268,7 @@ impl ConnCfg {
stream,
params,
cancel_closure,
_guage: NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&[ctx.protocol])
.guard(),
_guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
};
Ok(connection)

Some files were not shown because too many files have changed in this diff Show More