Compare commits

..

3 Commits

Author SHA1 Message Date
John Spray
b0e3edda2e convenience command for setting threshold eviction 2024-05-09 10:05:30 +01:00
John Spray
1117b0f429 Add pageserver-enable-heatmaps 2024-05-07 09:46:04 +01:00
John Spray
c8379f0128 storcon_cli: add tenant-drop 2024-04-23 19:38:00 +01:00
135 changed files with 2253 additions and 7181 deletions

View File

@@ -477,8 +477,6 @@ jobs:
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporarily disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -558,9 +556,6 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: false
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones

90
Cargo.lock generated
View File

@@ -722,9 +722,9 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
dependencies = [
"async-trait",
"base64 0.21.1",
@@ -752,9 +752,9 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.19.0"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
dependencies = [
"async-lock",
"async-trait",
@@ -772,9 +772,9 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
dependencies = [
"RustyXML",
"async-lock",
@@ -791,9 +791,9 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
dependencies = [
"RustyXML",
"azure_core",
@@ -812,9 +812,9 @@ dependencies = [
[[package]]
name = "azure_svc_blobstorage"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
dependencies = [
"azure_core",
"bytes",
@@ -1319,7 +1319,6 @@ dependencies = [
"git-version",
"hex",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"nix 0.27.1",
"once_cell",
@@ -2764,9 +2763,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.69"
version = "0.3.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790"
dependencies = [
"wasm-bindgen",
]
@@ -3185,16 +3184,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3531,12 +3520,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -5102,7 +5085,6 @@ dependencies = [
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
@@ -5112,11 +5094,8 @@ dependencies = [
"hex",
"histogram",
"itertools",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest",
@@ -5125,10 +5104,8 @@ dependencies = [
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
@@ -5797,6 +5774,7 @@ dependencies = [
"anyhow",
"clap",
"comfy-table",
"humantime",
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
@@ -6435,10 +6413,11 @@ dependencies = [
[[package]]
name = "tracing"
version = "0.1.40"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
dependencies = [
"cfg-if",
"log",
"pin-project-lite",
"tracing-attributes",
@@ -6458,9 +6437,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
version = "0.1.27"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
dependencies = [
"proc-macro2",
"quote",
@@ -6469,9 +6448,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.32"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
dependencies = [
"once_cell",
"valuable",
@@ -6529,7 +6508,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -6927,9 +6905,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
@@ -6937,9 +6915,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
dependencies = [
"bumpalo",
"log",
@@ -6952,9 +6930,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.42"
version = "0.4.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0"
checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
dependencies = [
"cfg-if",
"js-sys",
@@ -6964,9 +6942,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@@ -6974,9 +6952,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
dependencies = [
"proc-macro2",
"quote",
@@ -6987,9 +6965,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
[[package]]
name = "wasm-streams"
@@ -7021,9 +6999,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.69"
version = "0.3.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
dependencies = [
"js-sys",
"wasm-bindgen",

View File

@@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.19"
azure_identity = "0.19"
azure_storage = "0.19"
azure_storage_blobs = "0.19"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
azure_storage_blobs = "0.18"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"

View File

@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
ifndef DISABLE_HOMEBREW
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# Use -C option so that when PostgreSQL "make install" installs the

View File

@@ -51,7 +51,6 @@ use tracing::{error, info};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -69,29 +68,6 @@ use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "latest";
/// Entry point: runs the startup stages in order, then waits for Postgres and cleans up.
///
/// `init` runs first. `process_cli`, `wait_spec` and `start_postgres` then run inside an
/// inner block so that the startup tracing guard is dropped as soon as startup finishes.
/// Afterwards `wait_postgres` blocks on the Postgres handle and `cleanup_and_exit`
/// receives the results of both the start and the wait stage.
///
/// An error from any stage is propagated to the caller via `?`.
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) =
{
// Enter startup tracing context
// The guard lives until the closing brace of this block.
let _startup_context_guard = startup_context_from_env();
let cli_result = process_cli(&clap_args)?;
let wait_spec_result = wait_spec(build_tag, cli_result)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing context
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
cleanup_and_exit(start_pg_result, wait_pg_result)
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -106,11 +82,35 @@ fn init() -> Result<(String, clap::ArgMatches)> {
.to_string();
info!("build_tag: {build_tag}");
Ok((build_tag, cli().get_matches()))
}
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
{
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -147,7 +147,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
@@ -157,42 +157,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
Some(guard)
} else {
None
}
}
};
fn process_cli(
matches: &clap::ArgMatches,
) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -233,45 +199,6 @@ fn process_cli(
}
};
let result = ProcessCliResult {
// directly from CLI:
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
// others:
spec,
live_config_allowed,
};
Ok(result)
}
/// Output of `process_cli`, consumed (destructured) by `wait_spec`.
///
/// The `'clap` lifetime ties the borrowed string fields to the `clap::ArgMatches`
/// they were read from.
struct ProcessCliResult<'clap> {
/// Value of the `connstr` argument (the Postgres connection string).
connstr: &'clap str,
/// Value of the `pgdata` argument (the PGDATA path).
pgdata: &'clap str,
/// Value of the `pgbin` argument; `process_cli` falls back to `"postgres"` when absent.
pgbin: &'clap str,
/// Derived from the `remote-ext-config` argument; `None` when it was not given.
ext_remote_storage: Option<&'clap str>,
/// Value of the `http-port` argument.
http_port: u16,
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
// NOTE(review): how this flag is computed is not visible in this hunk — confirm in `process_cli`.
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec,
live_config_allowed,
}: ProcessCliResult,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
@@ -310,6 +237,8 @@ fn wait_spec(
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -326,19 +255,6 @@ fn wait_spec(
}
}
Ok(WaitSpecResult { compute, http_port })
}
/// Output of `wait_spec`, consumed (destructured) by `start_postgres`.
struct WaitSpecResult {
/// Shared handle to the compute node built in `wait_spec`.
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
}
fn start_postgres(
matches: &clap::ArgMatches,
WaitSpecResult { compute, http_port }: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
@@ -365,10 +281,9 @@ fn start_postgres(
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let extension_server_port: u16 = http_port;
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
@@ -419,7 +334,7 @@ fn start_postgres(
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = rt.as_ref().map(|rt| {
let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
@@ -432,43 +347,12 @@ fn start_postgres(
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
/// Handle pair returned by `start_postgres` for a started Postgres: the child process and a
/// thread join handle (bound as `logs_handle` where `wait_postgres` destructures it).
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
/// State produced by `start_postgres` (alongside the optional [`PostgresHandle`]) and
/// consumed by `cleanup_and_exit`.
///
/// The `rt`, `token` and `vm_monitor` fields exist only on Linux builds.
struct StartPostgresResult {
// NOTE(review): initialised to `false` in `start_postgres`; where it is set to `true`
// is outside the visible hunks — confirm there.
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
/// Tokio runtime on which the vm-monitor task is spawned, if one was created.
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
/// Cancellation token used internally by the monitor to clean up all threads.
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
/// Join handle of the spawned `vm_monitor::start` task, if a runtime was available.
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(
pg: Option<PostgresHandle>,
) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -483,26 +367,6 @@ fn wait_postgres(
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
/// Output of `wait_postgres`, consumed by `cleanup_and_exit`.
struct WaitPostgresResult {
/// Exit code taken from the Postgres process's exit status; starts out as `None` in
/// `wait_postgres` and stays `None` if no code was recorded.
exit_code: Option<i32>,
}
fn cleanup_and_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
WaitPostgresResult { exit_code }: WaitPostgresResult,
) -> Result<()> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.

View File

@@ -17,7 +17,6 @@ nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }

View File

@@ -417,54 +417,6 @@ async fn handle_tenant(
println!("{} {:?}", t.id, t.state);
}
}
Some(("import", import_match)) => {
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
let storage_controller = StorageController::from_env(env);
let create_response = storage_controller.tenant_import(tenant_id).await?;
let shard_zero = create_response
.shards
.first()
.expect("Import response omitted shards");
let attached_pageserver_id = shard_zero.node_id;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
println!(
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
);
let timelines = pageserver
.http_client
.list_timelines(shard_zero.shard_id)
.await?;
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
let main_timeline = timelines
.iter()
.find(|t| t.ancestor_timeline_id.is_none())
.expect("No timelines found")
.timeline_id;
let mut branch_i = 0;
for timeline in timelines.iter() {
let branch_name = if timeline.timeline_id == main_timeline {
"main".to_string()
} else {
branch_i += 1;
format!("branch_{branch_i}")
};
println!(
"Importing timeline {tenant_id}/{} as branch {branch_name}",
timeline.timeline_id
);
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
}
}
Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config")
@@ -1528,8 +1480,6 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
)
.subcommand(
Command::new("pageserver")
@@ -1554,8 +1504,8 @@ fn cli() -> Command {
Command::new("storage_controller")
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller"))
.subcommand(Command::new("stop").about("Stop storage controller")
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
)
.subcommand(

View File

@@ -17,7 +17,6 @@ use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use utils::{
auth::{encode_from_key_file, Claims},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -67,10 +66,6 @@ pub struct LocalEnv {
pub broker: NeonBroker,
// Configuration for the storage controller (1 per neon_local environment)
#[serde(default)]
pub storage_controller: NeonStorageControllerConf,
/// This Vec must always contain at least one pageserver
pub pageservers: Vec<PageServerConf>,
@@ -103,29 +98,6 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr,
}
/// Configuration for the storage controller of a neon_local environment.
///
/// Because of `#[serde(default)]`, fields missing from the serialized form are filled in
/// from this type's `Default` implementation.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
/// Heartbeat timeout before marking a node offline
// (De)serialized via `humantime_serde`.
#[serde(with = "humantime_serde")]
pub max_unavailable: Duration,
}
impl NeonStorageControllerConf {
/// Default for `max_unavailable`: 10 seconds.
///
/// Use a shorter pageserver unavailability interval than the default to speed up tests.
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(10);
}
/// Default configuration; also what `#[serde(default)]` on the struct falls back to for
/// fields missing at deserialization time.
impl Default for NeonStorageControllerConf {
/// Returns a config whose `max_unavailable` is `DEFAULT_MAX_UNAVAILABLE_INTERVAL`.
fn default() -> Self {
Self {
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
}
}
}
// Dummy Default impl to satisfy Deserialize derive.
impl Default for NeonBroker {
fn default() -> Self {
@@ -157,8 +129,6 @@ pub struct PageServerConf {
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
pub(crate) get_impl: Option<String>,
pub(crate) validate_vectored_get: Option<bool>,
}
impl Default for PageServerConf {
@@ -171,8 +141,6 @@ impl Default for PageServerConf {
http_auth_type: AuthType::Trust,
virtual_file_io_engine: None,
get_vectored_impl: None,
get_impl: None,
validate_vectored_get: None,
}
}
}

View File

@@ -92,8 +92,6 @@ impl PageServerNode {
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
} = &self.conf;
let id = format!("id={}", id);
@@ -113,16 +111,6 @@ impl PageServerNode {
} else {
String::new()
};
let get_impl = if let Some(get_impl) = get_impl {
format!("get_impl='{get_impl}'")
} else {
String::new()
};
let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
format!("validate_vectored_get={validate_vectored_get}")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -136,8 +124,6 @@ impl PageServerNode {
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -448,11 +434,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -571,11 +552,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
}
};

View File

@@ -1,7 +1,4 @@
use crate::{
background_process,
local_env::{LocalEnv, NeonStorageControllerConf},
};
use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
@@ -35,13 +32,15 @@ pub struct StorageController {
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
config: NeonStorageControllerConf,
}
const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
config: env.storage_controller.clone(),
}
}
@@ -274,6 +272,8 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
let mut args = vec![
"-l",
&self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
"--database-url",
&database_url,
"--max-unavailable-interval",
&humantime::Duration::from(self.config.max_unavailable).to_string(),
&max_unavailable.to_string(),
]
.into_iter()
.map(|s| s.to_string())
@@ -472,16 +472,6 @@ impl StorageController {
.await
}
/// Asks the storage controller to import the tenant `tenant_id`.
///
/// Issues a `POST` to `debug/v1/tenant/{tenant_id}/import` through `dispatch`, passing
/// `None` as the third argument (presumably the request body — confirm against
/// `dispatch`), and returns the resulting [`TenantCreateResponse`].
///
/// # Errors
/// Returns whatever error `dispatch` reports.
#[instrument(skip(self))]
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
self.dispatch::<(), TenantCreateResponse>(
Method::POST,
format!("debug/v1/tenant/{tenant_id}/import"),
None,
)
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(

View File

@@ -9,6 +9,7 @@ license.workspace = true
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
humantime.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true

View File

@@ -8,8 +8,9 @@ use pageserver_api::{
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -126,6 +127,24 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
TenantDrop {
#[arg(long)]
tenant_id: TenantId,
},
PageserverEnableHeatmaps {
#[arg(long)]
tenant_id: TenantId,
},
TenantSetTimeBasedEviction {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
period: humantime::Duration,
#[arg(long)]
threshold: humantime::Duration,
},
}
#[derive(Parser)]
@@ -675,6 +694,84 @@ async fn main() -> anyhow::Result<()> {
}
}
}
Command::TenantDrop { tenant_id } => {
storcon_client
.dispatch::<(), ()>(
Method::POST,
format!("debug/v1/tenant/{tenant_id}/drop"),
None,
)
.await?;
}
Command::PageserverEnableHeatmaps { tenant_id } => {
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: TenantConfig {
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,
compaction_algorithm: None,
gc_horizon: None,
gc_period: None,
image_creation_threshold: None,
pitr_interval: None,
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
eviction_policy: None,
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None,
heatmap_period: Some("60s".to_string()),
lazy_slru_download: None,
timeline_get_throttle: None,
image_layer_creation_check_threshold: None,
},
})
.await?;
}
Command::TenantSetTimeBasedEviction {
tenant_id,
period,
threshold,
} => {
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: TenantConfig {
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,
compaction_algorithm: None,
gc_horizon: None,
gc_period: None,
image_creation_threshold: None,
pitr_interval: None,
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
EvictionPolicyLayerAccessThreshold {
period: period.into(),
threshold: threshold.into(),
},
)),
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None,
heatmap_period: None,
lazy_slru_download: None,
timeline_get_throttle: None,
image_layer_creation_check_threshold: None,
},
})
.await?;
}
}
Ok(())

View File

@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.

View File

@@ -256,16 +256,7 @@ fn update_rusage_metrics() {
DISK_IO_BYTES
.with_label_values(&["write"])
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
// On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
#[cfg(target_os = "macos")]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
}
#[cfg(not(target_os = "macos"))]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
fn get_rusage_stats() -> libc::rusage {

View File

@@ -4,6 +4,7 @@ use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::ops::RangeInclusive;
use std::{fmt, ops::Range};
use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -29,25 +30,24 @@ pub const KEY_SIZE: usize = 18;
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;
/// The key prefix start range for the metadata keys. All keys whose first byte is >= 0x60 and < 0x7F are metadata keys.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
/// The key prefix start range for the metadata keys. All keys whose first byte is >= 0x80 are metadata keys.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x61;
pub const RELATION_SIZE_PREFIX: u8 = 0x81;
/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x62;
pub const AUX_KEY_PREFIX: u8 = 0x82;
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
key[0] >= METADATA_KEY_BEGIN_PREFIX
}
impl Key {
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key(&self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
self.field1 >= METADATA_KEY_BEGIN_PREFIX
}
/// Encode a metadata key to a storage key.
@@ -80,7 +80,7 @@ impl Key {
}
/// Get the range of metadata keys.
pub fn metadata_key_range() -> Range<Self> {
pub fn metadata_key_range() -> RangeInclusive<Self> {
Key {
field1: METADATA_KEY_BEGIN_PREFIX,
field2: 0,
@@ -88,32 +88,13 @@ impl Key {
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: METADATA_KEY_END_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// Get the range of aux keys.
pub fn metadata_aux_key_range() -> Range<Self> {
Key {
field1: AUX_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: AUX_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..=Key {
field1: u8::MAX,
field2: u16::MAX as u32,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX,
}
}
@@ -122,7 +103,7 @@ impl Key {
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
(((self.field1 & 0xf) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
| ((self.field4 as i128) << 40)
@@ -132,7 +113,7 @@ impl Key {
pub const fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0x7F) as u8,
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,
field3: (x >> 72) as u32,
field4: (x >> 40) as u32,

View File

@@ -1,10 +1,7 @@
use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::{
key::Key,
shard::{ShardCount, ShardIdentity},
};
use crate::key::Key;
use itertools::Itertools;
///
@@ -17,279 +14,44 @@ pub struct KeySpace {
pub ranges: Vec<Range<Key>>,
}
/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);
/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
///
/// When we iterate over keys within this object, we will skip any keys that don't belong
/// to this shard.
///
/// The start + end keys may not belong to the shard: these specify where layer files should
/// start + end, but we will never actually read/write those keys.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ShardedRange<'a> {
pub shard_identity: &'a ShardIdentity,
pub range: Range<Key>,
}
/// Length, in keys, of a range that has already been established to be contiguous
/// (see the `debug_assert!` below).
///
/// Two shapes are handled:
/// - the range starts on a `field6 == 0xffffffff` key: that one key is counted, plus
///   every key from `field6 == 0` up to (but excluding) `end.field6`;
/// - otherwise the length is the plain difference of the two `field6` values.
fn contiguous_range_len(range: &Range<Key>) -> u32 {
    debug_assert!(is_contiguous_range(range));
    let Range { start, end } = range;
    match start.field6 {
        // One key for the 0xffffffff slot itself, then `end.field6` keys after the wrap.
        0xffffffff => end.field6 + 1,
        first => end.field6 - first,
    }
}
/// Decide whether a key range stays inside a single relation's data blocks, or covers
/// exactly the `0xffffffff` (logical size) key of one relation followed by blocks of the next.
///
/// "Contiguous" here means the keys are known to be in use _somewhere_, though not
/// necessarily on our shard; `ShardedRange` does the extra work of figuring out how much of
/// such a range is present on one shard.
///
/// Why callers care:
/// - inside such a range keys are densely used, outside of it the keyspace is sparse;
/// - inside such a range, distances can be computed by simple subtraction of `field6`.
fn is_contiguous_range(range: &Range<Key>) -> bool {
    let (start, end) = (&range.start, &range.end);

    // field1..field4 must be identical on both ends.
    let same_prefix = (start.field1, start.field2, start.field3, start.field4)
        == (end.field1, end.field2, end.field3, end.field4);
    if !same_prefix {
        return false;
    }

    // Same field5: the range is confined to one relation's block space.
    if start.field5 == end.field5 {
        return true;
    }

    // Otherwise the only accepted shape is: start on the 0xffffffff key, end in the
    // immediately following field5.
    start.field6 == 0xffffffff && start.field5 + 1 == end.field5
}
impl<'a> ShardedRange<'a> {
/// Pair a key range with the shard identity used to interpret it.
pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
    Self { range, shard_identity }
}
/// Break up this range into chunks, each of which has at least one local key in it if the
/// total range has at least one local key.
///
/// Returns `(count, range)` pairs. `count` is the number of keys in `range` that are not
/// disposable for this shard (per `ShardIdentity::is_key_disposable`), or `u32::MAX` when the
/// input range is not contiguous and is therefore returned unsplit. Non-disposable pieces are
/// capped so that a fragment's `count` is filled up to `target_nblocks` before a new fragment
/// is started.
pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
// Optimization for single-key case (e.g. logical size keys)
if self.range.end == self.range.start.add(1) {
return vec![(
if self.shard_identity.is_key_disposable(&self.range.start) {
0
} else {
1
},
self.range,
)];
}
if !is_contiguous_range(&self.range) {
// Ranges that span relations are not fragmented. We only get these ranges as a result
// of operations that act on existing layers, so we trust that the existing range is
// reasonably small.
return vec![(u32::MAX, self.range)];
}
let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
let mut cursor = self.range.start;
while cursor < self.range.end {
// Distance from the cursor to the next stripe boundary or the end of the range.
let advance_by = self.distance_to_next_boundary(cursor);
// Disposability is evaluated at the cursor only and applied to the whole piece.
let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);
// If the previous fragment is undersized, then we seek to consume enough
// blocks to complete it.
let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
Some(frag) => {
// Prev block is complete, want the full number.
(
target_nblocks,
if is_fragment_disposable {
// If this current range will be empty (not shard-local data), we will merge into previous
Some(frag)
} else {
None
},
)
}
None => {
// First iteration, want the full number
(target_nblocks, None)
}
};
// Disposable pieces add nothing to the count, so they are consumed whole; local
// pieces are capped at what the current fragment still needs.
let advance_by = if is_fragment_disposable {
advance_by
} else {
std::cmp::min(advance_by, want_blocks)
};
let next_cursor = cursor.add(advance_by);
let this_frag = (
if is_fragment_disposable {
0
} else {
advance_by
},
cursor..next_cursor,
);
cursor = next_cursor;
if let Some(last_fragment) = merge_last_fragment {
// Previous fragment was short or this one is empty, merge into it
last_fragment.0 += this_frag.0;
last_fragment.1.end = this_frag.1.end;
} else {
fragments.push(this_frag);
}
}
fragments
}
/// Estimate how many physical pages inside this range live on this shard.
///
/// A result of `u32::MAX` means the range spans relations (or the count does not fit in a
/// `u32`) and should be read as "large".
pub fn page_count(&self) -> u32 {
    let range = &self.range;

    // Single-key ranges (e.g. logical sizes): the answer is 0 or 1.
    if range.end == range.start.add(1) {
        let disposable = self.shard_identity.is_key_disposable(&range.start);
        return u32::from(!disposable);
    }

    // An authentic count is only possible for contiguous key ranges.
    if !is_contiguous_range(range) {
        return u32::MAX;
    }

    // Fewer than two shards: logical and physical sizes coincide.
    if self.shard_identity.count < ShardCount::new(2) {
        return contiguous_range_len(range);
    }

    // General case: walk the range one stripe (or part-stripe) at a time and add up the
    // pieces that belong to this shard.
    let mut total: u64 = 0;
    let mut pos = range.start;
    while pos < range.end {
        // Step to the next stripe boundary, or to the end of the range.
        let step = self.distance_to_next_boundary(pos);

        // If the blocks in this stripe belong to us, add them to the count.
        if !self.shard_identity.is_key_disposable(&pos) {
            total += u64::from(step);
        }

        pos = pos.add(step);
    }

    // Clamp to u32::MAX rather than truncating.
    total.min(u64::from(u32::MAX)) as u32
}
/// Compute how far `cursor` must advance to reach the next potential fragment boundary:
/// this is either a stripe boundary, or the end of the range.
///
/// The cursor itself is not modified; the distance (in keys) is returned. The result is
/// never larger than the distance from `cursor` to `self.range.end`.
fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
// Upper bound for every return path below except the logical-size special case.
let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
if self.shard_identity.count < ShardCount::new(2) {
// Optimization: don't bother stepping through stripes if the tenant isn't sharded.
return distance_to_range_end;
}
if cursor.field6 == 0xffffffff {
// We are wrapping from one relation's logical size to the next relation's first data block
return 1;
}
// Number of blocks left in the stripe that contains `cursor`, counting `cursor` itself.
let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
let stripe_remainder = self.shard_identity.stripe_size.0
- (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);
if cfg!(debug_assertions) {
// We should never overflow field5 and field6 -- our callers check this earlier
// and would have returned their u32::MAX cases if the input range violated this.
let next_cursor = cursor.add(stripe_remainder);
debug_assert!(
next_cursor.field1 == cursor.field1
&& next_cursor.field2 == cursor.field2
&& next_cursor.field3 == cursor.field3
&& next_cursor.field4 == cursor.field4
&& next_cursor.field5 == cursor.field5
)
}
std::cmp::min(stripe_remainder, distance_to_range_end)
}
/// Number of pages in the key space covered by `range`, ignoring sharding.
///
/// Unlike `page_count`, which estimates the pages physically present on this shard, this
/// makes no attempt to exclude pages that would not actually be stored on this node.
/// Non-contiguous ranges report `u32::MAX`.
///
/// Don't use this function in code that works with physical entities like layer files.
fn raw_size(range: &Range<Key>) -> u32 {
    if !is_contiguous_range(range) {
        return u32::MAX;
    }
    contiguous_range_len(range)
}
}
impl KeySpace {
/// Create a key space with a single range.
pub fn single(key_range: Range<Key>) -> Self {
Self {
ranges: vec![key_range],
}
}
///
/// Partition a key space into chunks of roughly 'target_size' bytes
/// in each partition.
///
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
// Assume that each value is 8k in size.
let target_nblocks = (target_size / BLCKSZ as u64) as u32;
let target_nblocks = (target_size / BLCKSZ as u64) as usize;
let mut parts = Vec::new();
let mut current_part = Vec::new();
let mut current_part_size: usize = 0;
for range in &self.ranges {
// While doing partitioning, wrap the range in ShardedRange so that our size calculations
// will respect shard striping rather than assuming all keys within a range are present.
let range = ShardedRange::new(range.clone(), shard_identity);
// Chunk up the range into parts that each contain up to target_size local blocks
for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
// If appending the next contiguous range in the keyspace to the current
// partition would cause it to be too large, and our current partition
// covers at least one block that is physically present in this shard,
// then start a new partition
if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
&& current_part_size > 0
{
parts.push(KeySpace {
ranges: current_part,
});
current_part = Vec::new();
current_part_size = 0;
}
current_part.push(frag_range.start..frag_range.end);
current_part_size += frag_on_shard_size as usize;
// If appending the next contiguous range in the keyspace to the current
// partition would cause it to be too large, start a new partition.
let this_size = key_range_size(range) as usize;
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
parts.push(KeySpace {
ranges: current_part,
});
current_part = Vec::new();
current_part_size = 0;
}
// If the next range is larger than 'target_size', split it into
// 'target_size' chunks.
let mut remain_size = this_size;
let mut start = range.start;
while remain_size > target_nblocks {
let next = start.add(target_nblocks as u32);
parts.push(KeySpace {
ranges: vec![start..next],
});
start = next;
remain_size -= target_nblocks
}
current_part.push(start..range.end);
current_part_size += remain_size;
}
// add last partition that wasn't full yet.
@@ -302,10 +64,6 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Returns true when `total_raw_size()` reports zero for this keyspace.
pub fn is_empty(&self) -> bool {
self.total_raw_size() == 0
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not overlap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
@@ -396,11 +154,11 @@ impl KeySpace {
self.ranges.last().map(|range| range.end)
}
/// The size of the keyspace in pages, before accounting for sharding
pub fn total_raw_size(&self) -> usize {
#[allow(unused)]
pub fn total_size(&self) -> usize {
self.ranges
.iter()
.map(|range| ShardedRange::raw_size(range) as usize)
.map(|range| key_range_size(range) as usize)
.sum()
}
@@ -420,11 +178,6 @@ impl KeySpace {
pub fn overlaps(&self, range: &Range<Key>) -> bool {
self.overlaps_at(range).is_some()
}
/// Check if the keyspace contains a key, by testing the single-key range
/// `key..key.next()` for overlap with this keyspace.
pub fn contains(&self, key: &Key) -> bool {
    let single_key = *key..key.next();
    self.overlaps(&single_key)
}
}
///
@@ -439,33 +192,10 @@ pub struct KeyPartitioning {
pub parts: Vec<KeySpace>,
}
/// Represents a partitioning of the sparse key space.
#[derive(Clone, Debug, Default)]
pub struct SparseKeyPartitioning {
pub parts: Vec<SparseKeySpace>,
}
impl KeyPartitioning {
pub fn new() -> Self {
KeyPartitioning { parts: Vec::new() }
}
/// Convert a key partitioning to a sparse partition.
pub fn into_sparse(self) -> SparseKeyPartitioning {
SparseKeyPartitioning {
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
}
}
}
impl SparseKeyPartitioning {
    /// Unwrap every sparse part, in order, back into a plain `KeyPartitioning`.
    ///
    /// Note: use this function with caution. Treating a sparse keyspace the same way as a
    /// dense keyspace will cause long/dead loops.
    pub fn into_dense(self) -> KeyPartitioning {
        let mut parts = Vec::with_capacity(self.parts.len());
        for sparse in self.parts {
            parts.push(sparse.0);
        }
        KeyPartitioning { parts }
    }
}
///
@@ -497,7 +227,7 @@ impl KeySpaceAccum {
#[inline(always)]
pub fn add_range(&mut self, range: Range<Key>) {
self.size += ShardedRange::raw_size(&range) as u64;
self.size += key_range_size(&range) as u64;
match self.accum.as_mut() {
Some(accum) => {
@@ -529,9 +259,7 @@ impl KeySpaceAccum {
std::mem::take(self).to_keyspace()
}
// The total number of keys in this object, ignoring any sharding effects that might cause some of
// the keys to be omitted in storage on this shard.
pub fn raw_size(&self) -> u64 {
pub fn size(&self) -> u64 {
self.size
}
}
@@ -587,19 +315,36 @@ impl KeySpaceRandomAccum {
}
}
/// Number of keys in the half-open range `key_range`, clamped to `u32::MAX`.
///
/// - If the two ends differ in any of `field1`..`field4`, the size is reported as
///   `u32::MAX` ("large / unknown").
/// - Otherwise `(field5, field6)` is treated as one 40-bit counter and the size is the
///   difference between end and start, clamped to `u32::MAX`.
/// - An inverted range (`end < start`) is empty, so its size is 0. Plain subtraction
///   would underflow here: a panic in debug builds, a wrap to `u32::MAX` in release.
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
    let (start, end) = (&key_range.start, &key_range.end);

    if (start.field1, start.field2, start.field3, start.field4)
        != (end.field1, end.field2, end.field3, end.field4)
    {
        return u32::MAX;
    }

    let lo = (u64::from(start.field5) << 32) | u64::from(start.field6);
    let hi = (u64::from(end.field5) << 32) | u64::from(end.field6);

    // Saturate instead of underflowing when the range is inverted (i.e. empty).
    let diff = hi.saturating_sub(lo);

    // Clamp rather than truncate.
    diff.min(u64::from(u32::MAX)) as u32
}
/// Build the half-open range `key..key.next()`, i.e. the range that starts at `key` and
/// ends at whatever `Key::next` returns for it.
pub fn singleton_range(key: Key) -> Range<Key> {
    let end = key.next();
    Range { start: key, end }
}
#[cfg(test)]
mod tests {
use rand::{RngCore, SeedableRng};
use crate::{
models::ShardParameters,
shard::{ShardCount, ShardNumber},
};
use super::*;
use std::fmt::Write;
@@ -642,17 +387,14 @@ mod tests {
accum.add_range(range.clone());
}
let expected_size: u64 = ranges
.iter()
.map(|r| ShardedRange::raw_size(r) as u64)
.sum();
assert_eq!(accum.raw_size(), expected_size);
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
assert_eq!(accum.size(), expected_size);
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
assert_eq!(accum.raw_size(), 0);
assert_eq!(accum.size(), 0);
assert_ks_eq(&accum.consume_keyspace(), vec![]);
assert_eq!(accum.raw_size(), 0);
assert_eq!(accum.size(), 0);
for range in &ranges {
accum.add_range(range.clone());
@@ -949,412 +691,4 @@ mod tests {
]
);
}
#[test]
fn sharded_range_relation_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
},
&shard_identity,
);
// Key range spans relations, expect MAX
assert_eq!(range.page_count(), u32::MAX);
}
#[test]
fn shard_identity_keyspaces_single_key() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
},
&shard_identity,
);
// Single-key range on logical size key
assert_eq!(range.page_count(), 1);
}
/// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
#[test]
fn contiguous_range_check() {
assert!(!is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
),);
// The ranges goes all the way up to the 0xffffffff, including it: this is
// not considered a rel block range because 0xffffffff stores logical sizes,
// not blocks.
assert!(!is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
),);
// Keys within the normal data region of a relation
assert!(is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
),);
// The logical size key of one forkno, then some blocks in the next
assert!(is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
),);
}
#[test]
fn shard_identity_keyspaces_forkno_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
},
&shard_identity,
);
// Range spanning the end of one forkno and the start of the next: we do not attempt to
// calculate a valid size, because we have no way to know if they keys between start
// and end are actually in use.
assert_eq!(range.page_count(), u32::MAX);
}
#[test]
fn shard_identity_keyspaces_one_relation() {
for shard_number in 0..4 {
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
},
&shard_identity,
);
// Very simple case: range covering block zero of one relation, where that block maps to shard zero
if shard_number == 0 {
assert_eq!(range.page_count(), 1);
} else {
// Other shards should perceive the range's size as zero
assert_eq!(range.page_count(), 0);
}
}
}
/// Test helper: construct a ShardedRange and call fragment() on it, returning
/// the total page count in the range and the fragments.
fn do_fragment(
range_start: Key,
range_end: Key,
shard_identity: &ShardIdentity,
target_nblocks: u32,
) -> (u32, Vec<(u32, Range<Key>)>) {
let range = ShardedRange::new(
Range {
start: range_start,
end: range_end,
},
shard_identity,
);
let page_count = range.page_count();
let fragments = range.fragment(target_nblocks);
// Invariant: we always get at least one fragment
assert!(!fragments.is_empty());
// Invariant: the first/last fragment start/end should equal the input start/end
assert_eq!(fragments.first().unwrap().1.start, range_start);
assert_eq!(fragments.last().unwrap().1.end, range_end);
if page_count > 0 {
// Invariant: every fragment must contain at least one shard-local page, if the
// total range contains at least one shard-local page
let all_nonzero = fragments.iter().all(|f| f.0 > 0);
if !all_nonzero {
eprintln!("Found a zero-length fragment: {:?}", fragments);
}
assert!(all_nonzero);
} else {
// A range with no shard-local pages should always be returned as a single fragment
assert_eq!(fragments, vec![(0, range_start..range_end)]);
}
// Invariant: fragments must be ordered and non-overlapping
let mut last: Option<Range<Key>> = None;
for frag in &fragments {
if let Some(last) = last {
assert!(frag.1.start >= last.end);
assert!(frag.1.start > last.start);
}
last = Some(frag.1.clone())
}
// Invariant: fragments respect target_nblocks
for frag in &fragments {
assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
}
(page_count, fragments)
}
/// Really simple tests for fragment(), on a range that just contains a single stripe
/// for a single tenant.
#[test]
fn sharded_range_fragment_simple() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which we happen to know covers exactly one stripe which belongs to this shard
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
// Ask for stripe_size blocks, we get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 32768),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for more, we still get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 10000000),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for target_nblocks of half the stripe size, we get two halves
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16384),
(
32768,
vec![
(16384, input_start..input_start.add(16384)),
(16384, input_start.add(16384)..input_end)
]
)
);
}
#[test]
fn sharded_range_fragment_multi_stripe() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
// Ask for all the blocks, get a fragment that covers the whole range but reports
// its size to be just the blocks belonging to our shard.
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 131072),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for a sub-stripe quantity
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16000),
(
32768,
vec![
(16000, input_start..input_start.add(16000)),
(16000, input_start.add(16000)..input_start.add(32000)),
(768, input_start.add(32000)..input_end),
]
)
);
// Try on a range that starts slightly after our owned stripe
assert_eq!(
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
(32767, vec![(32767, input_start.add(1)..input_end)])
);
}
/// Test our calculations work correctly when we start a range from the logical size key of
/// a previous relation.
#[test]
fn sharded_range_fragment_starting_from_logical_size() {
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x8001, vec![(0x8001, input_start..input_end)])
);
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
// store all logical sizes)
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x1, vec![(0x1, input_start..input_end)])
);
}
/// Test that ShardedRange behaves properly when used on un-sharded data
#[test]
fn sharded_range_fragment_unsharded() {
let shard_identity = ShardIdentity::unsharded();
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(
0x10000,
vec![
(0x8000, input_start..input_start.add(0x8000)),
(0x8000, input_start.add(0x8000)..input_start.add(0x10000))
]
)
);
}
#[test]
fn sharded_range_fragment_cross_relation() {
let shard_identity = ShardIdentity::unsharded();
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
);
// Same, but using a sharded identity
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
);
}
/// Fragmenting with a `target_nblocks` much smaller than the range: the range must be
/// chopped into `target_nblocks`-sized pieces plus a smaller remainder.
#[test]
fn sharded_range_fragment_tiny_nblocks() {
let shard_identity = ShardIdentity::unsharded();
// A small range of 0x38 keys that differ only in the last key field: expect it to be
// split into three fragments of 16 and a final fragment of 8.
let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16),
(
0x38,
vec![
(16, input_start..input_start.add(16)),
(16, input_start.add(16)..input_start.add(32)),
(16, input_start.add(32)..input_start.add(48)),
(8, input_start.add(48)..input_end),
]
)
);
}
/// Randomized (but reproducible) exercise of `fragment()` over many shard layouts, target
/// sizes and ranges. Most of the checking happens inside `do_fragment`.
#[test]
fn sharded_range_fragment_fuzz() {
// Use a fixed seed: we don't want to explicitly pick values, but we do want
// the test to be reproducible.
let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
for _i in 0..1000 {
// Roughly half the iterations are unsharded, the rest use 1..=127 shards.
let shard_identity = if prng.next_u32() % 2 == 0 {
ShardIdentity::unsharded()
} else {
let shard_count = prng.next_u32() % 127 + 1;
ShardIdentity::new(
ShardNumber((prng.next_u32() % shard_count) as u8),
ShardCount::new(shard_count as u8),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap()
};
let target_nblocks = prng.next_u32() % 65536 + 1;
let start_offset = prng.next_u32() % 16384;
// Try ranges of 1 to 8192 keys, so the range is never empty
let range_size = prng.next_u32() % 8192 + 1;
// Build the input range from a fixed base key plus the random offset and size
let input_start = Key::from_hex("000000067F00000001000004E10000000000")
.unwrap()
.add(start_offset);
let input_end = input_start.add(range_size);
// This test's main success conditions are the invariants baked into do_fragment
let (_total_size, fragments) =
do_fragment(input_start, input_end, &shard_identity, target_nblocks);
// Pick a random key within the range and check it appears in the output
let example_key = input_start.add(prng.next_u32() % range_size);
// Panic on unwrap if it isn't found
let example_key_frag = fragments
.iter()
.find(|f| f.1.contains(&example_key))
.unwrap();
// Check that the fragment containing our random key has a nonzero size if
// that key is shard-local
let example_key_local = !shard_identity.is_key_disposable(&example_key);
if example_key_local {
assert!(example_key_frag.0 > 0);
}
}
}
}

View File

@@ -303,7 +303,6 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_to_aux_file_v2: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -782,17 +781,6 @@ pub struct SecondaryProgress {
pub bytes_total: u64,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantScanRemoteStorageShard {
pub tenant_shard_id: TenantShardId,
pub generation: Option<u32>,
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantScanRemoteStorageResponse {
pub shards: Vec<TenantScanRemoteStorageShard>,
}
pub mod virtual_file {
#[derive(
Copy,
@@ -860,72 +848,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
}
}
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V1,
V2,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub kind: u8,
pub segno: u32,
}
@@ -972,16 +927,14 @@ pub struct TenantHistorySize {
}
impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 2.
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -990,8 +943,8 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1000,8 +953,8 @@ impl PagestreamFeMessage {
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1011,15 +964,15 @@ impl PagestreamFeMessage {
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
@@ -1028,40 +981,18 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
let (request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V1 => {
// In the old protocol, each message starts with a boolean 'latest' flag,
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
// 'not_modified_since', used in the new protocol version.
let latest = body.read_u8()? != 0;
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
if latest {
(Lsn::MAX, request_lsn) // get latest version
} else {
(request_lsn, request_lsn) // get version at specified LSN
}
}
};
// The rest of the messages are the same between V1 and V2
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1070,8 +1001,8 @@ impl PagestreamFeMessage {
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1080,8 +1011,8 @@ impl PagestreamFeMessage {
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1091,14 +1022,14 @@ impl PagestreamFeMessage {
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
@@ -1226,8 +1157,8 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1236,8 +1167,8 @@ mod tests {
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
latest: false,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1246,8 +1177,8 @@ mod tests {
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1257,16 +1188,14 @@ mod tests {
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
.unwrap();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -1,11 +1,9 @@
use utils::lsn::Lsn;
use crate::keyspace::SparseKeySpace;
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,
pub sparse_keys: crate::keyspace::SparseKeySpace,
pub at_lsn: Lsn,
}
@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("sparse_keys")?;
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
sparse_keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
sparse_keys: SparseKeySpace(de.sparse_keys.0),
})
}
}
@@ -139,12 +133,6 @@ mod tests {
"030000000000000000000000000000000003"
]
],
"sparse_keys": [
[
"620000000000000000000000000000000000",
"620000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;

View File

@@ -451,7 +451,7 @@ impl ShardIdentity {
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
/// tenants. Modern single-shard tenants should not use this: they should
/// have number=0 count=1.
pub const fn unsharded() -> Self {
pub fn unsharded() -> Self {
Self {
number: ShardNumber(0),
count: ShardCount(0),
@@ -538,6 +538,24 @@ impl ShardIdentity {
}
}
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
///
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
/// as a symptom of that issue.
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
return false;
}
let mut hash = murmurhash32(key.field4);
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
// The key may be affected by issue #7454: it is an initfork and it would not
// have mapped to shard 0 until we fixed that issue.
mapped_shard != ShardNumber(0)
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split.
///

View File

@@ -21,13 +21,11 @@ use std::{
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
str::FromStr,
sync::Arc,
time::{Duration, SystemTime},
};
use anyhow::{bail, Context};
use aws_sdk_s3::types::StorageClass;
use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
@@ -565,7 +563,6 @@ pub struct S3Config {
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
pub concurrency_limit: NonZeroUsize,
pub max_keys_per_list_response: Option<i32>,
pub upload_storage_class: Option<StorageClass>,
}
impl Debug for S3Config {
@@ -694,18 +691,6 @@ impl RemoteStorageConfig {
endpoint,
concurrency_limit,
max_keys_per_list_response,
upload_storage_class: toml
.get("upload_storage_class")
.map(|prefix_in_bucket| -> anyhow::Result<_> {
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
let storage_class = StorageClass::from_str(&s).expect("infallible");
#[allow(deprecated)]
if matches!(storage_class, StorageClass::Unknown(_)) {
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
}
Ok(storage_class)
})
.transpose()?,
})
}
(_, _, _, Some(_), None) => {

View File

@@ -30,7 +30,7 @@ use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,7 +62,6 @@ pub struct S3Bucket {
bucket_name: String,
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
upload_storage_class: Option<StorageClass>,
concurrency_limiter: ConcurrencyLimiter,
// Per-request timeout. Accessible for tests.
pub timeout: Duration,
@@ -155,7 +154,6 @@ impl S3Bucket {
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
upload_storage_class: aws_config.upload_storage_class.clone(),
timeout,
})
}
@@ -584,7 +582,6 @@ impl RemoteStorage for S3Bucket {
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.set_metadata(metadata.map(|m| m.0))
.set_storage_class(self.upload_storage_class.clone())
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream)
.send();
@@ -636,7 +633,6 @@ impl RemoteStorage for S3Bucket {
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.set_storage_class(self.upload_storage_class.clone())
.copy_source(copy_source)
.send();
@@ -894,7 +890,6 @@ impl RemoteStorage for S3Bucket {
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.set_storage_class(self.upload_storage_class.clone())
.copy_source(&source_id)
.send();
@@ -1078,7 +1073,6 @@ mod tests {
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
upload_storage_class: None,
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");

View File

@@ -380,7 +380,6 @@ fn create_s3_client(
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};

View File

@@ -34,8 +34,6 @@ pub enum Generation {
/// scenarios where pageservers might otherwise issue conflicting writes to
/// remote storage
impl Generation {
pub const MAX: Self = Self::Valid(u32::MAX);
/// Create a new Generation that represents a legacy key format with
/// no generation suffix
pub fn none() -> Self {

View File

@@ -2,10 +2,11 @@
use std::cmp::{Eq, Ordering};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
use std::sync::Mutex;
use std::time::Duration;
use tokio::sync::watch::{self, channel};
use tokio::sync::watch::{channel, Receiver, Sender};
use tokio::time::timeout;
/// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
fn cnt_value(&self) -> V;
}
/// Heap of waiters, lowest numbers pop first.
struct Waiters<V>
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
S: MonotonicCounter<V>,
V: Ord,
{
heap: BinaryHeap<Waiter<V>>,
/// Number of the first waiter in the heap, or None if there are no waiters.
status_channel: watch::Sender<Option<V>>,
}
impl<V> Waiters<V>
where
V: Ord + Copy,
{
fn new() -> Self {
Waiters {
heap: BinaryHeap::new(),
status_channel: channel(None).0,
}
}
/// `status_channel` contains the number of the first waiter in the heap.
/// This function should be called whenever waiters heap changes.
fn update_status(&self) {
let first_waiter = self.heap.peek().map(|w| w.wake_num);
let _ = self.status_channel.send_replace(first_waiter);
}
/// Add new waiter to the heap, return a channel that will be notified when the number arrives.
fn add(&mut self, num: V) -> watch::Receiver<()> {
let (tx, rx) = channel(());
self.heap.push(Waiter {
wake_num: num,
wake_channel: tx,
});
self.update_status();
rx
}
/// Pop all waiters <= num from the heap. Collect channels in a vector,
/// so that caller can wake them up.
fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
let mut wake_these = Vec::new();
while let Some(n) = self.heap.peek() {
if n.wake_num > num {
break;
}
wake_these.push(self.heap.pop().unwrap().wake_channel);
}
self.update_status();
wake_these
}
/// Used on shutdown to efficiently drop all waiters.
fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
let heap = mem::take(&mut self.heap);
self.update_status();
heap
}
waiters: BinaryHeap<Waiter<V>>,
current: S,
shutdown: bool,
}
struct Waiter<T>
where
T: Ord,
{
wake_num: T, // wake me when this number arrives ...
wake_channel: watch::Sender<()>, // ... by sending a message to this channel
wake_num: T, // wake me when this number arrives ...
wake_channel: Sender<()>, // ... by sending a message to this channel
}
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {
impl<T: Ord> Eq for Waiter<T> {}
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
S: MonotonicCounter<V>,
V: Ord,
{
waiters: Waiters<V>,
current: S,
shutdown: bool,
}
/// A tool for waiting on a sequence number
///
/// This provides a way to wait for the arrival of a number.
@@ -168,7 +108,7 @@ where
/// Create a new `SeqWait`, initialized to a particular number
pub fn new(starting_num: S) -> Self {
let internal = SeqWaitInt {
waiters: Waiters::new(),
waiters: BinaryHeap::new(),
current: starting_num,
shutdown: false,
};
@@ -188,8 +128,9 @@ where
// Block any future waiters from starting
internal.shutdown = true;
// Take all waiters to drop them later.
internal.waiters.take_all()
// This will steal the entire waiters heap.
// When we drop it, all waiters will be woken.
mem::take(&mut internal.waiters)
// Drop the lock as we exit this scope.
};
@@ -255,7 +196,7 @@ where
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
let mut internal = self.internal.lock().unwrap();
if internal.current.cnt_value() >= num {
return Ok(None);
@@ -264,8 +205,12 @@ where
return Err(SeqWaitError::Shutdown);
}
// Add waiter channel to the queue.
let rx = internal.waiters.add(num);
// Create a new channel.
let (tx, rx) = channel(());
internal.waiters.push(Waiter {
wake_num: num,
wake_channel: tx,
});
// Drop the lock as we exit this scope.
Ok(Some(rx))
}
@@ -286,8 +231,16 @@ where
}
internal.current.cnt_advance(num);
// Pop all waiters <= num from the heap.
internal.waiters.pop_leq(num)
// Pop all waiters <= num from the heap. Collect them in a vector, and
// wake them up after releasing the lock.
let mut wake_these = Vec::new();
while let Some(n) = internal.waiters.peek() {
if n.wake_num > num {
break;
}
wake_these.push(internal.waiters.pop().unwrap().wake_channel);
}
wake_these
};
for tx in wake_these {
@@ -302,23 +255,6 @@ where
pub fn load(&self) -> S {
self.internal.lock().unwrap().current
}
/// Get a Receiver for the current status.
///
/// The current status is the number of the first waiter in the queue,
/// or None if there are no waiters.
///
/// This receiver will be notified whenever the status changes.
/// It is useful for receiving notifications when the first waiter
/// starts waiting for a number, or when there are no more waiters left.
pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
self.internal
.lock()
.unwrap()
.waiters
.status_channel
.subscribe()
}
}
#[cfg(test)]

View File

@@ -243,19 +243,6 @@ impl Client {
Ok(())
}
pub async fn tenant_scan_remote_storage(
&self,
tenant_id: TenantId,
) -> Result<TenantScanRemoteStorageResponse> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/scan_remote_storage",
self.mgmt_api_endpoint
);
let response = self.request(Method::GET, &uri, ()).await?;
let body = response.json().await.map_err(Error::ReceiveBody)?;
Ok(body)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;

View File

@@ -60,7 +60,7 @@ impl Client {
) -> anyhow::Result<PagestreamClient> {
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
.client
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
.await?;
let Client {
cancel_on_client_drop,

View File

@@ -18,7 +18,6 @@
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
@@ -126,7 +125,6 @@ async fn compact_level<E: CompactionJobExecutor>(
}
let mut state = LevelCompactionState {
shard_identity: *executor.get_shard_identity(),
target_file_size,
_lsn_range: lsn_range.clone(),
layers: layer_fragments,
@@ -166,8 +164,6 @@ struct LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
shard_identity: ShardIdentity,
// parameters
target_file_size: u64,
@@ -370,7 +366,6 @@ where
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
&self.shard_identity,
) * 8192;
let wal_size = job
@@ -435,7 +430,7 @@ where
keyspace,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
while let Some(key_range) = window.choose_next_image() {
new_jobs.push(CompactionJob::<E> {
key_range,
lsn_range: job.lsn_range.clone(),
@@ -628,12 +623,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
}
// Advance the cursor until it reaches 'target_keysize'.
fn advance_until_size(
&mut self,
w: &KeyspaceWindowHead<K>,
max_size: u64,
shard_identity: &ShardIdentity,
) {
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
while self.accum_keysize < max_size && !self.reached_end(w) {
let curr_range = &w.keyspace[self.keyspace_idx];
if self.end_key < curr_range.start {
@@ -642,7 +632,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
}
// We're now within 'curr_range'. Can we advance past it completely?
let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
let distance = K::key_range_size(&(self.end_key..curr_range.end));
if (self.accum_keysize + distance as u64) < max_size {
// oh yeah, it fits
self.end_key = curr_range.end;
@@ -651,7 +641,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
} else {
// advance within the range
let skip_key = self.end_key.skip_some();
let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
let distance = K::key_range_size(&(self.end_key..skip_key));
if (self.accum_keysize + distance as u64) < max_size {
self.end_key = skip_key;
self.accum_keysize += distance as u64;
@@ -687,7 +677,7 @@ where
}
}
fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
fn choose_next_image(&mut self) -> Option<Range<K>> {
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
// we've reached the end
return None;
@@ -697,7 +687,6 @@ where
next_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + self.head.target_keysize,
shard_identity,
);
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
@@ -706,7 +695,6 @@ where
end_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
shard_identity,
);
if end_pos.reached_end(&self.head) {
// gobble up any unused keyspace between the last used key and end of the range

View File

@@ -5,7 +5,6 @@ use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pageserver_api::shard::ShardIdentity;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
@@ -14,17 +13,11 @@ use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
pub fn keyspace_total_size<K>(
keyspace: &CompactionKeySpace<K>,
shard_identity: &ShardIdentity,
) -> u64
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
K: CompactionKey,
{
keyspace
.iter()
.map(|r| K::key_range_size(r, shard_identity) as u64)
.sum()
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
}
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {

View File

@@ -4,7 +4,7 @@
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use futures::Future;
use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
use utils::lsn::Lsn;
@@ -32,8 +32,6 @@ pub trait CompactionJobExecutor {
// Functions that the planner uses to support its decisions
// ----
fn get_shard_identity(&self) -> &ShardIdentity;
/// Return all layers that overlap the given bounding box.
fn get_layers(
&mut self,
@@ -100,7 +98,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
///
/// This returns u32, for compatibility with Repository::key. If the
/// distance is larger, return u32::MAX.
fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;
fn key_range_size(key_range: &Range<Self>) -> u32;
// return "self + 1"
fn next(&self) -> Self;
@@ -115,8 +113,8 @@ impl CompactionKey for Key {
const MIN: Self = Self::MIN;
const MAX: Self = Self::MAX;
fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
ShardedRange::new(r.clone(), shard_identity).page_count()
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
key_range_size(r)
}
fn next(&self) -> Key {
(self as &Key).next()

View File

@@ -3,7 +3,6 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use rand::Rng;
use tracing::info;
@@ -72,7 +71,7 @@ impl interface::CompactionKey for Key {
const MIN: Self = u64::MIN;
const MAX: Self = u64::MAX;
fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
fn key_range_size(key_range: &Range<Self>) -> u32 {
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
}
@@ -435,11 +434,6 @@ impl interface::CompactionJobExecutor for MockTimeline {
type ImageLayer = Arc<MockImageLayer>;
type RequestContext = MockRequestContext;
fn get_shard_identity(&self) -> &ShardIdentity {
static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
&IDENTITY
}
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,

View File

@@ -312,12 +312,8 @@ async fn main_impl(
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}

View File

@@ -85,27 +85,27 @@ mod tests {
// To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
// of the page server.
assert_eq!(
"6200000101E5B20C5F8DD5AA3289D6D9EAFA",
"8200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"620000010239AAC544893139B26F501B97E6",
"820000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"620000010300000000000000000000000000",
"820000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"62000001FF8635AF2134B7266EC5B4189FD6",
"82000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"6200000201772D0E5D71DE14DA86142A1619",
"8200000201772D0E5D71DE14DA86142A1619",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"620000FFFF1866EBEB53B807B26A2416F317",
"820000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
);
}

View File

@@ -10,10 +10,10 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, Context};
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, Key};
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
@@ -38,14 +38,6 @@ use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
#[error("basebackup pageserver error {0:#}")]
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#}")]
Client(#[source] io::Error),
}
/// Create basebackup with non-rel data in it.
/// Only include relational data if 'full_backup' is true.
///
@@ -61,7 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>(
prev_lsn: Option<Lsn>,
full_backup: bool,
ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
) -> anyhow::Result<()>
where
W: AsyncWrite + Send + Sync + Unpin,
{
@@ -100,10 +92,8 @@ where
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
return Err(BasebackupError::Server(anyhow!(
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
)));
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn);
}
provided_prev_lsn
} else {
@@ -169,26 +159,15 @@ where
}
}
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
let (kind, segno, _) = key_to_slru_block(*key)?;
match kind {
SlruKind::Clog => {
if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
return Err(BasebackupError::Server(anyhow!(
"invalid SlruKind::Clog record: block.len()={}",
block.len()
)));
}
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
}
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
if block.len() != BLCKSZ as usize {
return Err(BasebackupError::Server(anyhow!(
"invalid {:?} record: block.len()={}",
kind,
block.len()
)));
}
ensure!(block.len() == BLCKSZ as usize);
}
}
@@ -215,15 +194,12 @@ where
Ok(())
}
async fn flush(&mut self) -> Result<(), BasebackupError> {
async fn flush(&mut self) -> anyhow::Result<()> {
let nblocks = self.buf.len() / BLCKSZ as usize;
let (kind, segno) = self.current_segment.take().unwrap();
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
let header = new_tar_header(&segname, self.buf.len() as u64)?;
self.ar
.append(&header, self.buf.as_slice())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, self.buf.as_slice()).await?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -233,7 +209,7 @@ where
Ok(())
}
async fn finish(mut self) -> Result<(), BasebackupError> {
async fn finish(mut self) -> anyhow::Result<()> {
let res = if self.current_segment.is_none() || self.buf.is_empty() {
Ok(())
} else {
@@ -250,7 +226,7 @@ impl<'a, W> Basebackup<'a, W>
where
W: AsyncWrite + Send + Sync + Unpin,
{
async fn send_tarball(mut self) -> Result<(), BasebackupError> {
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -286,25 +262,16 @@ where
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.partition(
self.timeline.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
);
.await?
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
for (key, block) in blocks {
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
slru_builder.add_block(&key, block).await?;
slru_builder.add_block(&key, block?).await?;
}
}
slru_builder.finish().await?;
@@ -312,11 +279,8 @@ where
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self
.timeline
.list_dbdirs(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -325,8 +289,7 @@ where
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
@@ -334,7 +297,20 @@ where
if rel.forknum == INIT_FORKNUM {
// I doubt we need _init fork itself, but having it at least
// serves as a marker that the relation is unlogged.
self.add_rel(rel, rel).await?;
if let Err(_e) = self.add_rel(rel, rel).await {
if self
.timeline
.get_shard_identity()
.is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
{
// Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
// whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
// postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
// recreate.
tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
continue;
}
};
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
continue;
}
@@ -349,12 +325,7 @@ where
}
}
for (path, content) in self
.timeline
.list_aux_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
{
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
if path.starts_with("pg_replslot") {
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
let restart_lsn = Lsn(u64::from_le_bytes(
@@ -385,41 +356,34 @@ where
for xid in self
.timeline
.list_twophase_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
{
self.add_twophase_file(xid).await?;
}
fail_point!("basebackup-before-control-file", |_| {
Err(BasebackupError::Server(anyhow!(
"failpoint basebackup-before-control-file"
)))
bail!("failpoint basebackup-before-control-file")
});
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
self.ar.finish().await?;
debug!("all tarred up!");
Ok(())
}
/// Add contents of relfilenode `src`, naming it as `dst`.
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar
.append(&header, &mut io::empty())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
}
@@ -433,18 +397,14 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
let file_name = dst.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar
.append(&header, segment_data.as_slice())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, segment_data.as_slice()).await?;
seg += 1;
startblk = endblk;
@@ -464,22 +424,20 @@ where
spcnode: u32,
dbnode: u32,
has_relmap_file: bool,
) -> Result<(), BasebackupError> {
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
if img.len()
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
{
return Err(BasebackupError::Server(anyhow!(
"img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
img.len(),
)));
}
ensure!(
img.len()
== dispatch_pgversion!(
self.timeline.pg_version,
pgv::bindings::SIZEOF_RELMAPFILE
)
);
Some(img)
} else {
@@ -492,20 +450,14 @@ where
ver => format!("{ver}\x0A"),
};
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
info!("timeline.pg_version {}", self.timeline.pg_version);
if let Some(img) = relmap_img {
// filenode map for global tablespace
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &img[..]).await?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -524,26 +476,18 @@ where
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.is_empty()
{
return Ok(());
}
// User defined tablespaces are not supported
if spcnode != DEFAULTTABLESPACE_OID {
return Err(BasebackupError::Server(anyhow!(
"spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
)));
}
ensure!(spcnode == DEFAULTTABLESPACE_OID);
// Append dir path for each database
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, &mut io::empty())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &mut io::empty()).await?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -553,17 +497,11 @@ where
ver => format!("{ver}\x0A"),
};
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &img[..]).await?;
}
};
Ok(())
@@ -572,12 +510,11 @@ where
//
// Extract twophase state files
//
async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -585,10 +522,7 @@ where
buf.put_u32_le(crc);
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar
.append(&header, &buf[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &buf[..]).await?;
Ok(())
}
@@ -597,28 +531,24 @@ where
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
//
async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: invalid")?;
}
} else {
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar
.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)
.await
.map_err(BasebackupError::Client)?;
.await?;
let checkpoint_bytes = self
.timeline
@@ -640,10 +570,7 @@ where
//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
self.ar
.append(&header, &pg_control_bytes[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &pg_control_bytes[..]).await?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -658,16 +585,8 @@ where
self.lsn,
)
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
if wal_seg.len() != WAL_SEGMENT_SIZE {
return Err(BasebackupError::Server(anyhow!(
"wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
wal_seg.len()
)));
}
self.ar
.append(&header, &wal_seg[..])
.await
.map_err(BasebackupError::Client)?;
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..]).await?;
Ok(())
}
}

View File

@@ -121,10 +121,8 @@ fn main() -> anyhow::Result<()> {
&[("node_id", &conf.id.to_string())],
);
// after setting up logging, log the effective IO engine choice and read path implementations
// after setting up logging, log the effective IO engine choice
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {

View File

@@ -30,9 +30,9 @@ use utils::{
logging::LogFormat,
};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
@@ -91,8 +91,6 @@ pub mod defaults {
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
pub const DEFAULT_GET_IMPL: &str = "legacy";
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
@@ -140,8 +138,6 @@ pub mod defaults {
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
#get_impl = '{DEFAULT_GET_IMPL}'
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
@@ -288,8 +284,6 @@ pub struct PageServerConf {
pub get_vectored_impl: GetVectoredImpl,
pub get_impl: GetImpl,
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub validate_vectored_get: bool,
@@ -420,8 +414,6 @@ struct PageServerConfigBuilder {
get_vectored_impl: BuilderValue<GetVectoredImpl>,
get_impl: BuilderValue<GetImpl>,
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
validate_vectored_get: BuilderValue<bool>,
@@ -511,7 +503,6 @@ impl PageServerConfigBuilder {
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
@@ -690,10 +681,6 @@ impl PageServerConfigBuilder {
self.get_vectored_impl = BuilderValue::Set(value);
}
pub fn get_impl(&mut self, value: GetImpl) {
self.get_impl = BuilderValue::Set(value);
}
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
self.max_vectored_read_bytes = BuilderValue::Set(value);
}
@@ -763,7 +750,6 @@ impl PageServerConfigBuilder {
secondary_download_concurrency,
ingest_batch_size,
get_vectored_impl,
get_impl,
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
@@ -1049,9 +1035,6 @@ impl PageServerConf {
"get_vectored_impl" => {
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
}
"get_impl" => {
builder.get_impl(parse_toml_from_str("get_impl", item)?)
}
"max_vectored_read_bytes" => {
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
builder.get_max_vectored_read_bytes(
@@ -1143,7 +1126,6 @@ impl PageServerConf {
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
@@ -1383,7 +1365,6 @@ background_task_maximum_delay = '334 s'
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
@@ -1457,7 +1438,6 @@ background_task_maximum_delay = '334 s'
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
@@ -1577,7 +1557,6 @@ broker_endpoint = '{broker_endpoint}'
endpoint: Some(endpoint.clone()),
concurrency_limit: s3_concurrency_limit,
max_keys_per_list_response: None,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
},

View File

@@ -19,8 +19,6 @@ use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantScanRemoteStorageResponse;
use pageserver_api::models::TenantScanRemoteStorageShard;
use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
@@ -31,7 +29,6 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{SizeResult, StorageModel};
@@ -57,9 +54,6 @@ use crate::tenant::mgr::{
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::remote_timeline_client;
use crate::tenant::remote_timeline_client::download_index_part;
use crate::tenant::remote_timeline_client::list_remote_tenant_shards;
use crate::tenant::remote_timeline_client::list_remote_timelines;
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -1918,14 +1912,12 @@ async fn timeline_collect_keyspace(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let (dense_ks, sparse_ks) = timeline
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
// This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
// Therefore, we split dense/sparse keys in this API.
let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
json_response(StatusCode::OK, res)
}
@@ -2043,79 +2035,6 @@ async fn secondary_upload_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_scan_remote_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let Some(remote_storage) = state.remote_storage.as_ref() else {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Remote storage not configured"
)));
};
let mut response = TenantScanRemoteStorageResponse::default();
let (shards, _other_keys) =
list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
for tenant_shard_id in shards {
let (timeline_ids, _other_keys) =
list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
let mut generation = Generation::none();
for timeline_id in timeline_ids {
match download_index_part(
remote_storage,
&tenant_shard_id,
&timeline_id,
Generation::MAX,
&cancel,
)
.instrument(info_span!("download_index_part",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
%timeline_id))
.await
{
Ok((index_part, index_generation)) => {
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
generation = std::cmp::max(generation, index_generation);
}
Err(DownloadError::NotFound) => {
// This is normal for tenants that were created with multiple shards: they have an unsharded path
// containing the timeline's initdb tarball but no index. Otherwise it is a bit strange.
tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping");
continue;
}
Err(e) => {
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
}
};
}
response.shards.push(TenantScanRemoteStorageShard {
tenant_shard_id,
generation: generation.into(),
});
}
if response.shards.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(),
));
}
json_response(StatusCode::OK, response)
}
async fn secondary_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -2512,9 +2431,6 @@ pub fn make_router(
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})
.get("/v1/tenant/:tenant_id/scan_remote_storage", |r| {
api_handler(r, tenant_scan_remote_handler)
})
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})

View File

@@ -51,9 +51,6 @@ pub(crate) enum StorageTimeOperation {
#[strum(serialize = "gc")]
Gc,
#[strum(serialize = "update gc info")]
UpdateGcInfo,
#[strum(serialize = "create tenant")]
CreateTenant,
}
@@ -108,39 +105,31 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
});
// Metrics collected on operations on the storage repository.
#[derive(
Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
)]
pub(crate) enum GetKind {
Singular,
Vectored,
}
pub(crate) struct ReconstructTimeMetrics {
singular: Histogram,
vectored: Histogram,
ok: Histogram,
err: Histogram,
}
pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
&["get_kind"],
&["result"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ReconstructTimeMetrics {
singular: inner.with_label_values(&[GetKind::Singular.into()]),
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
err: inner.get_metric_with_label_values(&["err"]).unwrap(),
}
});
impl ReconstructTimeMetrics {
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
match get_kind {
GetKind::Singular => &self.singular,
GetKind::Vectored => &self.vectored,
pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
match result {
Ok(_) => &self.ok,
Err(_) => &self.err,
}
}
}
@@ -153,33 +142,13 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) struct ReconstructDataTimeMetrics {
singular: Histogram,
vectored: Histogram,
}
impl ReconstructDataTimeMetrics {
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
match get_kind {
GetKind::Singular => &self.singular,
GetKind::Vectored => &self.vectored,
}
}
}
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["get_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ReconstructDataTimeMetrics {
singular: inner.with_label_values(&[GetKind::Singular.into()]),
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
}
.expect("failed to define a metric")
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
@@ -1522,6 +1491,35 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
}
});
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
});
pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
@@ -1723,43 +1721,6 @@ macro_rules! redo_bytes_histogram_count_buckets {
};
}
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) time_spent_on_ingest: Histogram,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
time_spent_on_ingest: register_histogram!(
"pageserver_wal_ingest_put_value_seconds",
"Actual time spent on ingesting a record",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -1913,22 +1874,6 @@ impl StorageTimeMetricsTimer {
self.metrics.timeline_count.inc();
self.metrics.global_histogram.observe(duration);
}
/// Turns this timer into a timer which will always record -- usually this means recording
/// regardless of whether an early `?` path was taken in a function.
pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
AlwaysRecordingStorageTimeMetricsTimer(Some(self))
}
}
pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
fn drop(&mut self) {
if let Some(inner) = self.0.take() {
inner.stop_and_record();
}
}
}
/// Timing facilities for a globally histogrammed metric, which is supported by per tenant and
@@ -1989,7 +1934,6 @@ pub(crate) struct TimelineMetrics {
pub imitate_logical_size_histo: StorageTimeMetrics,
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub update_gc_info_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2050,12 +1994,6 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let update_gc_info_histo = StorageTimeMetrics::new(
StorageTimeOperation::UpdateGcInfo,
&tenant_id,
&shard_id,
&timeline_id,
);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2098,7 +2036,6 @@ impl TimelineMetrics {
logical_size_histo,
imitate_logical_size_histo,
garbage_collect_histo,
update_gc_info_histo,
load_layer_map_histo,
last_record_gauge,
resident_physical_size_gauge,

View File

@@ -1,5 +1,13 @@
//
//! The Page Service listens for client connections and serves their GetPage@LSN
//! requests.
//
// It is possible to connect here using the usual psql/pgbench/libpq. The following
// commands are currently supported:
// *status* -- show actual info about this pageserver,
// *pagestream* -- enter mode where smgr and pageserver talk with their
// custom protocol.
//
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
@@ -15,7 +23,7 @@ use pageserver_api::models::{
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
PagestreamNblocksResponse, PagestreamProtocolVersion,
PagestreamNblocksResponse,
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
@@ -48,7 +56,6 @@ use utils::{
use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
@@ -544,7 +551,6 @@ impl PageServerHandler {
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
protocol_version: PagestreamProtocolVersion,
ctx: RequestContext,
) -> Result<(), QueryError>
where
@@ -607,15 +613,14 @@ impl PageServerHandler {
t.trace(&copy_data_bytes)
}
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
// TODO: We could create a new per-request context here, with unique ID.
// Currently we use the same per-timeline context for all requests
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -624,7 +629,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::Nblocks(req) => {
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -634,7 +639,7 @@ impl PageServerHandler {
}
PagestreamFeMessage::GetPage(req) => {
// shard_id is filled in by the handler
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -643,7 +648,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::DbSize(req) => {
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -652,7 +657,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -833,80 +838,83 @@ impl PageServerHandler {
/// Helper function to handle the LSN from client request.
///
/// Each GetPage (and Exists and Nblocks) request includes information about
/// which version of the page is being requested. The primary compute node
/// will always request the latest page version, by setting 'request_lsn' to
/// the last inserted or flushed WAL position, while a standby will request
/// a version at the LSN that it's currently caught up to.
/// which version of the page is being requested. The client can request the
/// latest version of the page, or the version that's valid at a particular
/// LSN. The primary compute node will always request the latest page
/// version, while a standby will request a version at the LSN that it's
/// currently caught up to.
///
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
///
/// In addition to the request LSN, each request carries another LSN,
/// 'not_modified_since', which is a hint to the pageserver that the client
/// knows that the page has not been modified between 'not_modified_since'
/// and the request LSN. This allows skipping the wait, as long as the WAL
/// up to 'not_modified_since' has arrived. If the client doesn't have any
/// information about when the page was modified, it will use
/// not_modified_since == request_lsn. If the client lies and sends a
/// not_modified_since that is too low, such that there are in fact later page versions, the
/// behavior is undefined: the pageserver may return any of the page versions
/// or an error.
async fn wait_or_get_last_lsn(
timeline: &Timeline,
request_lsn: Lsn,
not_modified_since: Lsn,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
ctx: &RequestContext,
) -> Result<Lsn, PageStreamError> {
let last_record_lsn = timeline.get_last_record_lsn();
if latest {
// Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the
// page after that LSN. If we haven't received WAL up to that point,
// wait until it arrives.
let last_record_lsn = timeline.get_last_record_lsn();
// Sanity check the request
if request_lsn < not_modified_since {
return Err(PageStreamError::BadRequest(
format!(
"invalid request with request LSN {} and not_modified_since {}",
request_lsn, not_modified_since,
)
.into(),
));
}
if request_lsn < **latest_gc_cutoff_lsn {
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
// Note: this covers the special case that lsn == Lsn(0). That
// special case means "return the latest version whatever it is",
// and it's used for bootstrapping purposes, when the page server is
// connected directly to the compute node. That is needed because
// when you connect to the compute node, to receive the WAL, the
// walsender process will do a look up in the pg_authid catalog
// table for authentication. That poses a deadlock problem: the
// catalog table lookup will send a GetPage request, but the GetPage
// request will block in the page server because the recent WAL
// hasn't been received yet, and it cannot be received until the
// walsender completes the authentication and starts streaming the
// WAL.
if lsn <= last_record_lsn {
// It might be better to use max(lsn, latest_gc_cutoff_lsn) instead of
// last_record_lsn. That would give the same result, since we know
// that there haven't been modifications since 'lsn'. Using an older
// LSN might be faster, because that could allow skipping recent
// layers when finding the page.
lsn = last_record_lsn;
} else {
PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
).into())
});
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
// anyway)
}
} else {
if lsn == Lsn(0) {
return Err(PageStreamError::BadRequest(
"invalid LSN(0) in request".into(),
));
}
timeline
.wait_lsn(
not_modified_since,
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
// here instead. That would give the same result, since we know that there
// haven't been any modifications since 'not_modified_since'. Using an older
// LSN might be faster, because that could allow skipping recent layers when
// finding the page. However, we have historically used 'last_record_lsn', so
// stick to that for now.
Ok(std::cmp::min(last_record_lsn, request_lsn))
}
if lsn < **latest_gc_cutoff_lsn {
return Err(PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
).into()));
}
Ok(lsn)
}
#[instrument(skip_all, fields(shard_id))]
@@ -923,17 +931,12 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let exists = timeline
.get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -956,17 +959,12 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let n_blocks = timeline
.get_rel_size(req.rel, Version::Lsn(lsn), ctx)
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -989,17 +987,18 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let total_blocks = timeline
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::Lsn(lsn),
req.latest,
ctx,
)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -1166,17 +1165,12 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -1199,14 +1193,9 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let kind = SlruKind::from_repr(req.kind)
.ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
@@ -1237,13 +1226,6 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
fn map_basebackup_error(err: BasebackupError) -> QueryError {
match err {
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
}
}
let started = std::time::Instant::now();
// check that the timeline exists
@@ -1269,8 +1251,7 @@ impl PageServerHandler {
let lsn_awaited_after = started.elapsed();
// switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)
.map_err(QueryError::Disconnected)?;
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
// Send a tarball of the latest layer on the timeline. Compress if not
@@ -1285,8 +1266,7 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
} else {
let mut writer = pgb.copyout_writer();
if gzip {
@@ -1307,13 +1287,9 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
// shutdown the encoder to ensure the gzip footer is written
encoder
.shutdown()
.await
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
encoder.shutdown().await?;
} else {
basebackup::send_basebackup_tarball(
&mut writer,
@@ -1323,13 +1299,11 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
}
}
pgb.write_message_noflush(&BeMessage::CopyDone)
.map_err(QueryError::Disconnected)?;
pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let basebackup_after = started
@@ -1439,34 +1413,7 @@ where
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream_v2 ") {
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
} else if query_string.starts_with("pagestream ") {
if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
@@ -1485,14 +1432,8 @@ where
self.check_permission(Some(tenant_id))?;
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V1,
ctx,
)
.await?;
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
.await?;
} else if query_string.starts_with("basebackup ") {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();

View File

@@ -9,7 +9,6 @@
use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::repository::*;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
@@ -23,7 +22,6 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -177,6 +175,7 @@ impl Timeline {
tag: RelTag,
blknum: BlockNumber,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
if tag.relnode == 0 {
@@ -185,7 +184,7 @@ impl Timeline {
));
}
let nblocks = self.get_rel_size(tag, version, ctx).await?;
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
@@ -207,6 +206,7 @@ impl Timeline {
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<usize, PageReconstructError> {
let mut total_blocks = 0;
@@ -214,7 +214,7 @@ impl Timeline {
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
for rel in rels {
let n_blocks = self.get_rel_size(rel, version, ctx).await?;
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
@@ -225,6 +225,7 @@ impl Timeline {
&self,
tag: RelTag,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
if tag.relnode == 0 {
@@ -238,7 +239,7 @@ impl Timeline {
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, version, ctx).await?
&& !self.get_rel_exists(tag, version, latest, ctx).await?
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
@@ -261,6 +262,7 @@ impl Timeline {
&self,
tag: RelTag,
version: Version<'_>,
_latest: bool,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
if tag.relnode == 0 {
@@ -456,12 +458,6 @@ impl Timeline {
// Didn't find any commit timestamps smaller than the request
Ok(LsnForTimestamp::Past(min_lsn))
}
(true, _) if commit_lsn < min_lsn => {
// the search above did set found_smaller to true but it never increased the lsn.
// Then, low is still the old min_lsn, and the subtraction above gave a value
// below the min_lsn. We should never do that.
Ok(LsnForTimestamp::Past(min_lsn))
}
(true, false) => {
// Only found commits with timestamps smaller than the request.
// It's still a valid case for branch creation, return it.
@@ -731,13 +727,11 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed may be removed from the underlying storage (from
/// that LSN forwards).
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_keyspace(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
) -> Result<KeySpace, CollectKeySpaceError> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -809,12 +803,7 @@ impl Timeline {
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
result.add_key(AUX_FILES_KEY);
}
Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
))
Ok(result.to_keyspace())
}
/// Get cached size of relation if it not updated after specified LSN
@@ -1100,7 +1089,7 @@ impl<'a> DatadirModification<'a> {
) -> anyhow::Result<()> {
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
.await?;
// Remove entry from dbdir
@@ -1199,7 +1188,7 @@ impl<'a> DatadirModification<'a> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
.get_rel_exists(rel, Version::Modified(self), true, ctx)
.await?
{
let size_key = rel_size_to_key(rel);
@@ -1557,8 +1546,6 @@ impl<'a> DatadirModification<'a> {
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = self.tline.writer().await;
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -1598,8 +1585,6 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
timer.observe_duration();
Ok(())
}

View File

@@ -361,8 +361,6 @@ pub enum TaskKind {
DebugTool,
EphemeralFilePreWarmPageCache,
#[cfg(test)]
UnitTest,
}

View File

@@ -888,7 +888,7 @@ impl Tenant {
#[instrument(skip_all)]
pub(crate) async fn preload(
self: &Arc<Self>,
self: &Arc<Tenant>,
remote_storage: &GenericRemoteStorage,
cancel: CancellationToken,
) -> anyhow::Result<TenantPreload> {
@@ -918,13 +918,9 @@ impl Tenant {
Ok(TenantPreload {
deleting,
timelines: Self::load_timeline_metadata(
self,
remote_timeline_ids,
remote_storage,
cancel,
)
.await?,
timelines: self
.load_timeline_metadata(remote_timeline_ids, remote_storage, cancel)
.await?,
})
}
@@ -3406,11 +3402,7 @@ impl Tenant {
// is in progress (which is not a common case).
//
// See issue #2748, condensed out of the initial PR review, for more.
let mut shared_cache = tokio::select! {
locked = self.cached_logical_sizes.lock() => locked,
_ = cancel.cancelled() => anyhow::bail!("cancelled"),
_ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
};
let mut shared_cache = self.cached_logical_sizes.lock().await;
size::gather_inputs(
self,
@@ -3672,7 +3664,6 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
}
}
}
@@ -3873,9 +3864,7 @@ mod tests {
use hex_literal::hex;
use pageserver_api::key::NON_INHERITED_RANGE;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::CompactionAlgorithm;
use rand::{thread_rng, Rng};
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
static TEST_KEY: Lazy<Key> =
@@ -4513,23 +4502,11 @@ mod tests {
}
async fn bulk_insert_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
let compact = true;
bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
}
async fn bulk_insert_maybe_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<()> {
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
@@ -4570,11 +4547,9 @@ mod tests {
)
.await?;
timeline.freeze_and_flush().await?;
if compact {
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
}
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
timeline.gc().await?;
}
@@ -4678,9 +4653,7 @@ mod tests {
for read in reads {
info!("Doing vectored read on {:?}", read);
let vectored_res = tline
.get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
.await;
let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await;
tline
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
.await;
@@ -4725,12 +4698,7 @@ mod tests {
let read_lsn = child_timeline.get_last_record_lsn();
let vectored_res = child_timeline
.get_vectored_impl(
aux_keyspace.clone(),
read_lsn,
ValuesReconstructState::new(),
&ctx,
)
.get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
.await;
child_timeline
@@ -4878,12 +4846,7 @@ mod tests {
ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
};
let results = child_timeline
.get_vectored_impl(
read.clone(),
current_lsn,
ValuesReconstructState::new(),
&ctx,
)
.get_vectored_impl(read.clone(), current_lsn, &ctx)
.await?;
for (key, img_res) in results {
@@ -5016,7 +4979,6 @@ mod tests {
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
},
query_lsn,
ValuesReconstructState::new(),
&ctx,
)
.await;
@@ -5057,22 +5019,7 @@ mod tests {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let names_algorithms = [
("test_random_updates_legacy", CompactionAlgorithm::Legacy),
("test_random_updates_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_random_updates_algorithm(name, algorithm).await?;
}
Ok(())
}
async fn test_random_updates_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let harness = TenantHarness::create("test_random_updates")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5137,7 +5084,7 @@ mod tests {
);
}
// Perform a cycle of flush, and GC
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
@@ -5149,6 +5096,9 @@ mod tests {
)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.gc().await?;
}
@@ -5429,36 +5379,19 @@ mod tests {
#[tokio::test]
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
let names_algorithms = [
("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_read_at_max_lsn_algorithm(name, algorithm).await?;
}
Ok(())
}
async fn test_read_at_max_lsn_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let harness = TenantHarness::create("test_read_at_max_lsn")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let lsn = Lsn(0x10);
let compact = false;
bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let read_lsn = Lsn(u64::MAX - 1);
let result = tline.get(test_key, read_lsn, &ctx).await;
assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
Ok(())
}

View File

@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
self.offset
}
const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
/// Writes the given buffer directly to the underlying `VirtualFile`.
/// You need to make sure that the internal buffer is empty, otherwise

View File

@@ -369,10 +369,6 @@ pub struct TenantConf {
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expressed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
/// Switch to aux file v2. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
pub switch_to_aux_file_v2: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -468,10 +464,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_to_aux_file_v2: Option<bool>,
}
impl TenantConfOpt {
@@ -529,9 +521,6 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_to_aux_file_v2: self
.switch_to_aux_file_v2
.unwrap_or(global_conf.switch_to_aux_file_v2),
}
}
}
@@ -573,7 +562,6 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_to_aux_file_v2: false,
}
}
}
@@ -648,7 +636,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_to_aux_file_v2: value.switch_to_aux_file_v2,
}
}
}

View File

@@ -3,26 +3,36 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use bytes::BytesMut;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::io;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::TimelineId;
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
rw: page_caching::RW,
file: VirtualFile,
len: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
///
/// None <=> IO is ongoing.
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
mutable_tail: Option<BytesMut>,
}
mod page_caching;
mod zero_padded_read_write;
impl EphemeralFile {
pub async fn create(
conf: &PageServerConf,
@@ -49,18 +59,21 @@ impl EphemeralFile {
.await?;
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file),
file,
len: 0,
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
})
}
pub(crate) fn len(&self) -> u64 {
self.rw.bytes_written()
self.len
}
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
self.rw.page_cache_file_id()
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn read_blk(
@@ -68,30 +81,182 @@ impl EphemeralFile {
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
self.rw.read_blk(blknum, ctx).await
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = self
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
};
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(
self.mutable_tail
.as_deref()
.expect("we're not doing IO, it must be Some()")
.try_into()
.expect("we ensure that it's always PAGE_SZ"),
))
}
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
_ctx: &RequestContext,
ctx: &RequestContext,
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
/// Write cursor over an [`EphemeralFile`]: tracks the block and in-block offset
/// at which the next bytes will be placed in the file's mutable tail page.
struct Writer<'a> {
/// The file being appended to; borrowed mutably for the lifetime of the writer.
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`Writer::push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`Self::blknum`] to which [`Writer::push_bytes`] will write.
off: usize,
}
impl<'a> Writer<'a> {
/// Creates a writer positioned at the current end of `ephemeral_file`:
/// `blknum` and `off` are the block number and the in-block offset that
/// correspond to `ephemeral_file.len`.
///
/// The body shown here never fails; it always returns `Ok`.
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
// NOTE(review): `as u32` truncates silently if the block number does not fit in a u32.
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
/// Appends `src` at the writer's current position.
///
/// Bytes are copied into the mutable tail page. Each time that page becomes full
/// (`off == PAGE_SZ`) it is written to the file at offset `blknum * PAGE_SZ`; on
/// success the page cache is pre-warmed with the page (best effort), the tail
/// buffer is zeroed, and the writer advances to the start of the next block.
///
/// This method does not modify `ephemeral_file.len`.
///
/// # Errors
///
/// Returns an `io::Error` of kind `Other` if writing back a full tail page fails.
#[inline(always)]
async fn push_bytes(
&mut self,
src: &[u8],
ctx: &RequestContext,
) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
// The free part of the tail page, starting at the current offset.
let dst_remaining = &mut self
.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not yet ongoing")[self.off..];
// Copy as much as fits into the rest of the page.
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
// The page is full: move the buffer out of the Option (leaving `None`)
// so it can be handed to the write by value.
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
.expect("IO is not yet ongoing");
let (mutable_tail, res) = self
.ephemeral_file
.file
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
.await;
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
// I.e., the IO isn't retryable if we panic.
// The buffer is put back regardless of whether the write succeeded.
self.ephemeral_file.mutable_tail = Some(mutable_tail);
match res {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
ctx,
)
.await
{
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
// Fill the freshly obtained cache slot with the page contents and publish it.
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(
self.ephemeral_file
.mutable_tail
.as_deref()
.expect("IO is not ongoing"),
);
let _ = write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correctness because the write_blob code below
// and similarly read_blk expect zeroed pages.
self.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not ongoing")
.fill(0);
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
Err(e) => {
// The write-back failed: report it with the block number and file path.
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path,
),
));
}
}
}
}
Ok(())
}
}
let pos = self.len;
let mut writer = Writer::new(self)?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
self.rw.write_all_borrowed(&len_buf).await?;
writer.push_bytes(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf).await?;
writer.push_bytes(&len_buf, ctx).await?;
}
// Write the payload
self.rw.write_all_borrowed(srcbuf).await?;
writer.push_bytes(srcbuf, ctx).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
} else {
self.len += 4;
}
self.len += srcbuf.len() as u64;
Ok(pos)
}
@@ -106,6 +271,28 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl Drop for EphemeralFile {
    /// Unlinks the backing file from the filesystem when the `EphemeralFile` is dropped.
    fn drop(&mut self) {
        // Pages belonging to this file may still be present in the [`crate::page_cache`].
        // They are left in place; [`crate::page_cache::PageCache::find_victim`] evicts them
        // when needed.

        match std::fs::remove_file(&self.file.path) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                // Not-found errors are never logged, since nothing can be done about them:
                // on detach the tenant directory is already gone.
                //
                // Missing files might also be related to
                // https://github.com/neondatabase/neon/issues/2442
            }
            Err(e) => {
                error!(
                    "could not remove ephemeral file '{}': {}",
                    self.file.path, e
                );
            }
        }
    }
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))

View File

@@ -1,218 +0,0 @@
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::io::{self, ErrorKind};
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
use super::zero_padded_read_write;
/// See module-level comment.
pub struct RW {
/// Id under which this file's pages are looked up in the [`crate::page_cache`].
page_cache_file_id: page_cache::FileId,
/// The wrapped zero-padded reader/writer, instantiated with [`PreWarmingWriter`] as its writer.
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
}
impl RW {
pub fn new(file: VirtualFile) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
page_cache_file_id,
file,
)),
}
}
pub fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
// because Compute is unlikely to access recently written data.
self.rw.write_all_borrowed(srcbuf).await
}
pub(crate) fn bytes_written(&self) -> u64 {
self.rw.bytes_written()
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
match self.rw.read_blk(blknum).await? {
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.rw.as_writer().file.path,
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
}
}
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
Ok(BlockLease::EphemeralFileMutableTail(buffer))
}
}
}
}
impl Drop for RW {
    /// Unlinks the underlying file from the filesystem when the `RW` is dropped.
    fn drop(&mut self) {
        // Pages belonging to this file may still be present in the [`crate::page_cache`].
        // They are left in place; [`crate::page_cache::PageCache::find_victim`] evicts them
        // when needed.

        match std::fs::remove_file(&self.rw.as_writer().file.path) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                // Not-found errors are never logged, since nothing can be done about them:
                // on detach the tenant directory is already gone.
                //
                // Missing files might also be related to
                // https://github.com/neondatabase/neon/issues/2442
            }
            Err(e) => {
                error!(
                    "could not remove ephemeral file '{}': {}",
                    self.rw.as_writer().file.path,
                    e
                );
            }
        }
    }
}
/// Writer over a [`VirtualFile`] that also carries the page cache file id used
/// when pre-warming the page cache with written blocks.
struct PreWarmingWriter {
/// Count of `PAGE_SZ`-sized blocks written so far; starts at 0.
nwritten_blocks: u32,
/// Id under which written blocks are inserted into the [`crate::page_cache`].
page_cache_file_id: page_cache::FileId,
/// The file that receives the writes.
file: VirtualFile,
}
impl PreWarmingWriter {
fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
Self {
nwritten_blocks: 0,
page_cache_file_id,
file,
}
}
}
/// Write path of [`PreWarmingWriter`]: every buffer is written to the file and
/// afterwards each page of it is copied into the page cache.
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
/// Writes all of `buf` to `self.file`, then inserts every `PAGE_SZ` block of it
/// into the page cache under `self.page_cache_file_id`.
///
/// Returns the number of bytes written together with the underlying buffer.
///
/// # Errors
///
/// A failed file write is returned as an `ErrorKind::Other` error; the buffer is
/// dropped in that case. Failures to obtain a page cache slot are only logged.
///
/// # Panics
///
/// Panics if the buffer length is not a multiple of `PAGE_SZ`, if the file reports
/// a short write, if the block counter overflows `u32`, or if the page cache
/// already contains one of the blocks that was just written.
async fn write_all<
B: tokio_epoll_uring::BoundedBuf<Buf = Buf>,
Buf: tokio_epoll_uring::IoBuf + Send,
>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let buf = buf.slice(..);
let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
// Only in debug test builds: keep a copy of the data so that the Slice
// reconstruction below can be checked against it.
let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) {
Some(buf.to_vec())
} else {
None
};
let buflen = buf.len();
assert_eq!(
buflen % PAGE_SZ,
0,
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
);
// Do the IO.
let iobuf = match self.file.write_all(buf).await {
(iobuf, Ok(nwritten)) => {
assert_eq!(nwritten, buflen);
iobuf
}
(_, Err(e)) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
self.nwritten_blocks, buflen, e, self.file.path,
),
));
}
};
// Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf)
let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds);
if let Some(check_bounds_stuff_works) = check_bounds_stuff_works {
assert_eq!(&check_bounds_stuff_works, &*buf);
}
// Pre-warm page cache with the contents.
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
// benefits the code that writes InMemoryLayer=>L0 layers.
let nblocks = buflen / PAGE_SZ;
let nblocks32 = u32::try_from(nblocks).unwrap();
let cache = page_cache::get();
// One shared context for all pre-warming page cache accesses, created on first use.
static CTX: Lazy<RequestContext> = Lazy::new(|| {
RequestContext::new(
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
crate::context::DownloadBehavior::Error,
)
});
for blknum_in_buffer in 0..nblocks {
let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
// Block number within the file: blocks written by earlier calls plus the
// position inside this buffer. The cast cannot truncate because `nblocks`
// was checked to fit into `u32` above.
let blknum = self
.nwritten_blocks
.checked_add(blknum_in_buffer as u32)
.unwrap();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
.await
{
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
Ok(v) => match v {
page_cache::ReadBufResult::Found(_guard) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
and this function takes &mut self, so, no concurrent read_blk is possible");
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
// Fill the freshly obtained cache slot with the page and mark it valid;
// the guard returned by `mark_valid` is not needed here.
write_guard.copy_from_slice(blk_in_buffer);
let _ = write_guard.mark_valid();
}
},
}
}
// Account for the blocks of this buffer only after all of them were processed.
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
Ok((buflen, buf.into_inner()))
}
}

View File

@@ -1,125 +0,0 @@
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
//!
//! # Writes
//!
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
//! The [`RW`] batches these into bigger, [`TAIL_SZ`]-sized writes, using [`owned_buffers_io::write::BufferedWriter`].
//!
//! # Reads
//!
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
//!
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
//! if the read is for the prefix that has already been flushed.
//!
//! # Current Usage
//!
//! The current user of this module is [`super::page_caching::RW`].
mod zero_padded;
use crate::{
page_cache::PAGE_SZ,
virtual_file::owned_buffers_io::{
self,
write::{Buffer, OwnedAsyncWriter},
},
};
const TAIL_SZ: usize = 64 * 1024;
/// Buffered reader/writer on top of an [`OwnedAsyncWriter`]; see the module-level comment.
pub struct RW<W: OwnedAsyncWriter> {
/// Buffers writes in a zero-padded buffer of `TAIL_SZ` bytes and flushes them
/// into a size-tracking wrapper around the inner writer `W`.
buffered_writer: owned_buffers_io::write::BufferedWriter<
zero_padded::Buffer<TAIL_SZ>,
owned_buffers_io::util::size_tracking_writer::Writer<W>,
>,
}
/// Outcome of [`RW::read_blk`]: tells the caller where the requested block lives.
pub enum ReadResult<'a, W> {
/// The block lies in the prefix that has already been flushed;
/// the caller has to read it through `writer`.
NeedsReadFromWriter { writer: &'a W },
/// The block lies in the not yet flushed in-memory buffer; `buffer` is the
/// requested page, borrowed from the buffer's zero-padded contents.
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
}
impl<W> RW<W>
where
W: OwnedAsyncWriter,
{
pub fn new(writer: W) -> Self {
let bytes_flushed_tracker =
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
bytes_flushed_tracker,
zero_padded::Buffer::default(),
);
Self { buffered_writer }
}
pub(crate) fn as_writer(&self) -> &W {
self.buffered_writer.as_inner().as_inner()
}
pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf).await
}
pub fn bytes_written(&self) -> u64 {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
flushed_offset + u64::try_from(buffer.pending()).unwrap()
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
// The trailing page ("block") might only be partially filled,
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
// Moreover, it has to be zero-padded, because when we still had
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
// => check here that the read doesn't go beyond this potentially trailing
// => the zero-padding is done in the `else` branch below
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
buffered_offset / (PAGE_SZ as u64)
} else {
(buffered_offset / (PAGE_SZ as u64)) + 1
};
if (blknum as u64) >= blocks_written {
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
}
// assertions for the `if-else` below
assert_eq!(
flushed_offset % (TAIL_SZ as u64), 0,
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
);
assert_eq!(
flushed_offset % (PAGE_SZ as u64),
0,
"the logic below can't handle if the page is spread across the flushed part and the buffer"
);
if read_offset < flushed_offset {
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
Ok(ReadResult::NeedsReadFromWriter {
writer: self.as_writer(),
})
} else {
let read_offset_in_buffer = read_offset
.checked_sub(flushed_offset)
.expect("would have taken `if` branch instead of this one");
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
let zero_padded_slice = buffer.as_zero_padded_slice();
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
buffer: page
.try_into()
.expect("the slice above got it as page-size slice"),
})
}
}
}

View File

@@ -1,108 +0,0 @@
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
//! unwritten range is guaranteed to be zero-initialized.
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
use std::mem::MaybeUninit;
/// Fixed-capacity byte buffer whose unwritten range stays zeroed; see the module-level comment.
pub struct Buffer<const N: usize> {
    /// Backing storage of exactly `N` bytes, created zero-filled.
    allocation: Box<[u8; N]>,
    /// Number of bytes at the start of `allocation` that have been written.
    written: usize,
}

impl<const N: usize> Default for Buffer<N> {
    /// Creates an empty buffer whose `N` bytes are all zero.
    fn default() -> Self {
        // SAFETY: zeroed memory is a valid [u8; N]
        let zeroes: [u8; N] = unsafe { MaybeUninit::zeroed().assume_init() };
        Self {
            written: 0,
            allocation: Box::new(zeroes),
        }
    }
}
impl<const N: usize> Buffer<N> {
    /// Sanity checks on the buffer state: `written` is within bounds and the
    /// unwritten range is all zeroes. The checks are switched off.
    #[inline(always)]
    fn invariants(&self) {
        // don't check by default, unoptimized is too expensive even for debug mode
        const CHECK_INVARIANTS: bool = false;
        if CHECK_INVARIANTS {
            debug_assert!(self.written <= N, "{}", self.written);
            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
        }
    }

    /// Returns the whole `N`-byte allocation, including the part that has not been written.
    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
        let whole: &[u8; N] = &self.allocation;
        whole
    }
}
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
    type IoBuf = Self;

    /// Capacity of the buffer in bytes.
    fn cap(&self) -> usize {
        self.allocation.len()
    }

    /// Appends `other` behind the bytes written so far.
    ///
    /// Panics if `other` does not fit into the remaining capacity.
    fn extend_from_slice(&mut self, other: &[u8]) {
        self.invariants();
        let start = self.written;
        let free = self.allocation.len() - start;
        assert!(
            other.len() <= free,
            "calling extend_from_slice() with insufficient remaining capacity"
        );
        let end = start + other.len();
        self.allocation[start..end].copy_from_slice(other);
        self.written = end;
        self.invariants();
    }

    /// Number of bytes written into the buffer and not yet flushed.
    fn pending(&self) -> usize {
        self.written
    }

    /// Turns the buffer into a slice covering exactly the written bytes.
    fn flush(self) -> tokio_epoll_uring::Slice<Self> {
        self.invariants();
        let end = self.written;
        tokio_epoll_uring::BoundedBuf::slice(self, 0..end)
    }

    /// Takes the buffer back after a flush: zeroes the previously written
    /// range again and resets the write position to the start.
    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
        let mut recycled = iobuf;
        let dirty = recycled.written;
        recycled.allocation[0..dirty].fill(0);
        recycled.written = 0;
        recycled.invariants();
        recycled
    }
}
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
///
/// Remember that bytes_init is generally _not_ a tracker of the amount
/// of valid data in the io buffer; we use `Slice` for that.
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
///
/// SAFETY:
///
/// The [`Self::allocation`] is stable because boxes are stable.
/// The memory is zero-initialized, so, bytes_init is always N.
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
// Pointer to the start of the boxed allocation.
fn stable_ptr(&self) -> *const u8 {
self.allocation.as_ptr()
}
fn bytes_init(&self) -> usize {
// Yes, N, not self.written; Read the full comment of this impl block!
N
}
// The allocation always has exactly N bytes.
fn bytes_total(&self) -> usize {
N
}
}

View File

@@ -916,7 +916,6 @@ mod tests {
assert_eq!(lhs, rhs);
}
#[cfg(test)]
fn brute_force_range_search(
layer_map: &LayerMap,
key_range: Range<Key>,

View File

@@ -2,7 +2,6 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
@@ -254,15 +253,17 @@ impl TenantsMap {
}
}
/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
/// the slower actual deletion in the background.
///
/// This is "safe" in that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
let parent = path
.as_ref()
@@ -285,28 +286,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
Ok(tmp_path)
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
@@ -591,11 +570,7 @@ pub async fn init_tenant_mgr(
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
// Accumulate futures for writing tenant configs, so that we can execute in parallel
let mut config_write_futs = Vec::new();
// Update the location configs according to the re-attach response and persist them to disk
tracing::info!("Updating {} location configs", tenant_configs.len());
// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -622,22 +597,18 @@ pub async fn init_tenant_mgr(
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
SecondaryLocationConfig { warm: true };
// Update the location config according to the re-attach response
if let Some(tenant_modes) = &tenant_modes {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
match tenant_modes.get(&tenant_shard_id) {
None => {
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
match safe_rename_tenant_dir(&tenant_dir_path).await {
Ok(tmp_path) => {
spawn_background_purge(tmp_path);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
}
};
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
// We deleted local content: move on to next tenant, don't try and spawn this one.
continue;
@@ -683,32 +654,8 @@ pub async fn init_tenant_mgr(
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
config_write_futs.push(async move {
let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
(tenant_shard_id, location_conf, r)
});
}
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
// Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
tracing::info!(
"Writing {} location config files...",
config_write_futs.len()
);
let config_write_results = futures::stream::iter(config_write_futs)
.buffer_unordered(16)
.collect::<Vec<_>>()
.await;
tracing::info!(
"Spawning {} tenant shard locations...",
config_write_results.len()
);
// For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
// Errors writing configs are fatal
config_write_result?;
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => {
@@ -1752,7 +1699,7 @@ impl TenantManager {
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
spawn_background_purge(tmp_path);
self.spawn_background_purge(tmp_path);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
@@ -1907,6 +1854,28 @@ impl TenantManager {
shutdown_all_tenants0(self.tenants).await
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
pub(crate) async fn detach_tenant(
&self,
conf: &'static PageServerConf,
@@ -1923,7 +1892,7 @@ impl TenantManager {
deletion_queue_client,
)
.await?;
spawn_background_purge(tmp_path);
self.spawn_background_purge(tmp_path);
Ok(())
}

View File

@@ -243,9 +243,7 @@ use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::Generation;
pub(crate) use download::{
download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
};
pub(crate) use download::{is_temp_download_file, list_remote_timelines};
pub(crate) use index::LayerFileMetadata;
// Occasional network issues and such can cause remote operations to fail, and
@@ -474,7 +472,7 @@ impl RemoteTimelineClient {
},
);
let (index_part, _index_generation) = download::download_index_part(
let index_part = download::download_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.timeline_id,
@@ -1718,11 +1716,6 @@ impl RemoteTimelineClient {
}
}
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}");
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")

View File

@@ -5,7 +5,6 @@
use std::collections::HashSet;
use std::future::Future;
use std::str::FromStr;
use anyhow::{anyhow, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -26,13 +25,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use utils::id::TimelineId;
use super::index::{IndexPart, LayerFileMetadata};
use super::{
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
INITDB_PATH,
};
///
@@ -183,7 +182,6 @@ async fn download_object<'a>(
#[cfg(target_os = "linux")]
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
use bytes::BytesMut;
async {
let destination_file = VirtualFile::create(dst_path)
.await
@@ -196,10 +194,10 @@ async fn download_object<'a>(
// There's chunks_vectored() on the stream.
let (bytes_amount, destination_file) = async {
let size_tracking = size_tracking_writer::Writer::new(destination_file);
let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
size_tracking,
BytesMut::with_capacity(super::BUFFER_SIZE),
);
let mut buffered = owned_buffers_io::write::BufferedWriter::<
{ super::BUFFER_SIZE },
_,
>::new(size_tracking);
while let Some(res) =
futures::StreamExt::next(&mut download.download_stream).await
{
@@ -254,31 +252,42 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
}
}
async fn list_identifiers<T>(
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines(
storage: &GenericRemoteStorage,
prefix: RemotePath,
tenant_shard_id: TenantShardId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<T>, HashSet<String>)>
where
T: FromStr + Eq + std::hash::Hash,
{
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
});
let listing = download_retry_forever(
|| storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel),
&format!("list identifiers in prefix {prefix}"),
|| {
storage.list(
Some(&remote_path),
ListingMode::WithDelimiter,
None,
&cancel,
)
},
&format!("list timelines for {tenant_shard_id}"),
&cancel,
)
.await?;
let mut parsed_ids = HashSet::new();
let mut timeline_ids = HashSet::new();
let mut other_prefixes = HashSet::new();
for id_remote_storage_key in listing.prefixes {
let object_name = id_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}")
for timeline_remote_storage_key in listing.prefixes {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
})?;
match object_name.parse::<T>() {
Ok(t) => parsed_ids.insert(t),
match object_name.parse::<TimelineId>() {
Ok(t) => timeline_ids.insert(t),
Err(_) => other_prefixes.insert(object_name.to_string()),
};
}
@@ -290,31 +299,7 @@ where
other_prefixes.insert(object_name.to_string());
}
Ok((parsed_ids, other_prefixes))
}
/// List shards of given tenant in remote storage
pub(crate) async fn list_remote_tenant_shards(
storage: &GenericRemoteStorage,
tenant_id: TenantId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TenantShardId>, HashSet<String>)> {
let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id));
list_identifiers::<TenantShardId>(storage, remote_path, cancel).await
}
/// List timelines of given tenant shard in remote storage
pub async fn list_remote_timelines(
storage: &GenericRemoteStorage,
tenant_shard_id: TenantShardId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
});
let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
list_identifiers::<TimelineId>(storage, remote_path, cancel).await
Ok((timeline_ids, other_prefixes))
}
async fn do_download_index_part(
@@ -323,7 +308,7 @@ async fn do_download_index_part(
timeline_id: &TimelineId,
index_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation), DownloadError> {
) -> Result<IndexPart, DownloadError> {
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever(
@@ -348,7 +333,7 @@ async fn do_download_index_part(
.with_context(|| format!("deserialize index part file at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok((index_part, index_generation))
Ok(index_part)
}
/// index_part.json objects are suffixed with a generation number, so we cannot
@@ -357,13 +342,13 @@ async fn do_download_index_part(
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(crate) async fn download_index_part(
pub(super) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation), DownloadError> {
) -> Result<IndexPart, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() {

View File

@@ -118,9 +118,6 @@ pub(super) async fn gather_inputs(
ctx: &RequestContext,
) -> anyhow::Result<ModelInputs> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
//
// FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the
// whole computation. It does not make sense from the billing perspective.
tenant
.refresh_gc_info(cancel, ctx)
.await

View File

@@ -148,29 +148,6 @@ impl ValuesReconstructState {
self.layers_visited
}
/// This function is called after reading a keyspace from a layer.
/// It checks if the read path has now moved past the cached Lsn for any keys.
///
/// Implementation note: We intentionally iterate over the keys for which we've
/// already collected some reconstruct data. This avoids scaling complexity with
/// the size of the search space.
pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
for (key, value) in self.keys.iter_mut() {
if !keyspace.contains(key) {
continue;
}
if let Ok(state) = value {
if state.situation != ValueReconstructSituation::Complete
&& state.get_cached_lsn() >= Some(advanced_to)
{
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
}
}
}
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///
@@ -195,18 +172,11 @@ impl ValuesReconstructState {
true
}
Value::WalRecord(rec) => {
debug_assert!(
Some(lsn) > state.get_cached_lsn(),
"Attempt to collect a record below cached LSN for walredo: {} < {}",
lsn,
state
.get_cached_lsn()
.expect("Assertion can only fire if a cached lsn is present")
);
let reached_cache =
state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
let will_init = rec.will_init();
state.records.push((lsn, rec));
will_init
will_init || reached_cache
}
},
};

View File

@@ -217,7 +217,6 @@ pub struct DeltaLayerInner {
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
lsn_range: Range<Lsn>,
file: VirtualFile,
file_id: FileId,
@@ -746,7 +745,6 @@ impl DeltaLayerInner {
file_id,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn_range: actual_summary.lsn_range,
max_vectored_read_bytes,
}))
}
@@ -871,7 +869,7 @@ impl DeltaLayerInner {
let data_end_offset = self.index_start_offset();
let reads = Self::plan_reads(
&keyspace,
keyspace,
lsn_range,
data_end_offset,
index_reader,
@@ -885,13 +883,11 @@ impl DeltaLayerInner {
self.do_reads_and_update_state(reads, reconstruct_state)
.await;
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
Ok(())
}
async fn plan_reads<Reader>(
keyspace: &KeySpace,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
data_end_offset: u64,
index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
@@ -1539,7 +1535,7 @@ mod test {
// Plan and validate
let vectored_reads = DeltaLayerInner::plan_reads(
&keyspace,
keyspace.clone(),
lsn_range.clone(),
disk_offset,
reader,
@@ -1791,7 +1787,7 @@ mod test {
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
let vectored_reads = DeltaLayerInner::plan_reads(
&keyspace,
keyspace.clone(),
entries_meta.lsn_range.clone(),
data_end_offset,
index_reader,

View File

@@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, BinaryHeap, HashSet};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
@@ -78,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer {
}
pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<Key, VecMap<Lsn, u64>>,
index: HashMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
@@ -384,24 +384,29 @@ impl InMemoryLayer {
let mut planned_block_reads = BinaryHeap::new();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner.index.range(range.start..range.end) {
let lsn_range = match reconstruct_state.get_cached_lsn(key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
let mut key = range.start;
while key < range.end {
if let Some(vec_map) = inner.index.get(&key) {
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
planned_block_reads.push(BlockRead {
key: *key,
lsn: *entry_lsn,
block_offset: *pos,
});
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
planned_block_reads.push(BlockRead {
key,
lsn: *entry_lsn,
block_offset: *pos,
});
}
}
key = key.next();
}
}
let keyspace_size = keyspace.total_raw_size();
let keyspace_size = keyspace.total_size();
let mut completed_keys = HashSet::new();
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
@@ -433,8 +438,6 @@ impl InMemoryLayer {
}
}
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
Ok(())
}
}
@@ -477,7 +480,7 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
let key = InMemoryLayerFileId(file.id());
Ok(InMemoryLayer {
file_id: key,
@@ -494,7 +497,7 @@ impl InMemoryLayer {
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
inner: RwLock::new(InMemoryLayerInner {
index: BTreeMap::new(),
index: HashMap::new(),
file,
resource_units: GlobalResourceUnits::new(),
}),
@@ -597,17 +600,14 @@ impl InMemoryLayer {
}
}
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
/// layer will only contain the key range the user specifies, and may return `None`
/// if there are no matching keys.
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(
&self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> Result<Option<ResidentLayer>> {
) -> Result<ResidentLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -621,21 +621,6 @@ impl InMemoryLayer {
let end_lsn = *self.end_lsn.get().unwrap();
let keys: Vec<_> = if let Some(key_range) = key_range {
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.map(|(k, m)| (k.to_i128(), m))
.collect()
} else {
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
};
if keys.is_empty() {
return Ok(None);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
@@ -649,17 +634,26 @@ impl InMemoryLayer {
let cursor = inner.file.block_cursor();
// Sort the keys because delta layer writer expects them sorted.
//
// NOTE: this sort can take up significant time if the layer has millions of
// keys. To speed up all the comparisons we convert the key to i128 and
// keep the value as a reference.
let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
keys.sort_unstable_by_key(|k| k.0);
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
for (key, vec_map) in inner.index.iter() {
for (key, vec_map) in keys.iter() {
let key = Key::from_i128(*key);
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init)
.put_value_bytes(key, *lsn, buf, will_init)
.await;
res?;
}
@@ -667,6 +661,6 @@ impl InMemoryLayer {
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
Ok(Some(delta_layer))
Ok(delta_layer)
}
}

View File

@@ -336,12 +336,6 @@ impl Layer {
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
.await
.map_err(|err| match err {
GetVectoredError::Other(err) => GetVectoredError::Other(
err.context(format!("get_values_reconstruct_data for layer {self}")),
),
err => err,
})
}
/// Download the layer if evicted.
@@ -401,8 +395,8 @@ impl Layer {
&self.0.path
}
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
pub(crate) fn local_path_str(&self) -> &Arc<str> {
&self.0.path_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
@@ -527,8 +521,8 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
/// String representation of the full path, used for traversal id.
path_str: Arc<str>,
desc: PersistentLayerDesc,
@@ -735,7 +729,7 @@ impl LayerInner {
LayerInner {
conf,
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path_str: path.to_string().into(),
path,
desc,
timeline: Arc::downgrade(timeline),

View File

@@ -62,7 +62,7 @@ impl BackgroundLoopKind {
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> tokio::sync::SemaphorePermit<'static> {
) -> impl Drop {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
.with_label_values(&[loop_kind.as_static_str()])
.guard();

View File

@@ -17,7 +17,7 @@ use fail::fail_point;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
keyspace::KeySpaceAccum,
models::{
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
@@ -55,6 +55,7 @@ use std::{
ops::ControlFlow,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -65,7 +66,6 @@ use crate::{
disk_usage_eviction_task::DiskUsageEvictionInfo,
pgdatadir_mapping::CollectKeySpaceError,
};
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
@@ -119,8 +119,8 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
@@ -137,25 +137,6 @@ pub(super) enum FlushLoopState {
Exited,
}
/// How eagerly image layers should be produced by an image-layer creation pass.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ImageLayerCreationMode {
    /// Create image layers only when `time_for_new_image_layer` says so.
    /// This is the mode of the regular compaction code path.
    Try,
    /// Create image layers whenever possible. Metadata keys are still skipped for now.
    /// This is the mode of the compaction code path when the force flag is set.
    Force,
    /// Initial ingestion of data, where nothing may be dropped. The partitions handed in
    /// must therefore not contain metadata keys. This is the mode of the
    /// flush-frozen-layer code path.
    Initial,
}

impl std::fmt::Display for ImageLayerCreationMode {
    /// Renders the bare variant name, i.e. exactly what the derived `Debug` prints.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(self, formatter)
    }
}
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Hole {
@@ -336,7 +317,7 @@ pub struct Timeline {
pub initdb_lsn: Lsn,
/// When did we last calculate the partitioning?
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
@@ -672,19 +653,6 @@ impl From<GetVectoredError> for CreateImageLayersError {
}
}
/// Translates a vectored-read failure into the error type of the single-key read path.
impl From<GetVectoredError> for PageReconstructError {
    fn from(e: GetVectoredError) -> Self {
        match e {
            // Cancellation keeps its dedicated variant.
            GetVectoredError::Cancelled => Self::Cancelled,
            // The offending LSN is dropped; only a generic message is carried over.
            GetVectoredError::InvalidLsn(_) => Self::Other(anyhow!("Invalid LSN")),
            // Ancestor failures reuse the existing conversion for that error type.
            GetVectoredError::GetReadyAncestorError(ancestor_err) => Self::from(ancestor_err),
            // Already an opaque error: pass the payload through untouched.
            GetVectoredError::Other(inner) => Self::Other(inner),
            // No direct counterpart exists, so the whole error is wrapped as an opaque one.
            unmapped @ (GetVectoredError::Oversized(_) | GetVectoredError::MissingKey(_)) => {
                Self::Other(unmapped.into())
            }
        }
    }
}
impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
use GetReadyAncestorError::*;
@@ -714,23 +682,6 @@ pub enum GetVectoredImpl {
Vectored,
}
/// Selects which read path a singular page `get` goes through.
///
/// The string form (used for parsing, display and serde via the strum/serde_with
/// derives below) is kebab-case, i.e. `legacy` and `vectored`.
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetImpl {
/// Reconstruct the page through the single-key read path.
Legacy,
/// Reconstruct the page by issuing a vectored read over a one-key keyspace.
Vectored,
}
pub(crate) enum WaitLsnWaiter<'a> {
Timeline(&'a Timeline),
Tenant,
@@ -792,6 +743,16 @@ impl Timeline {
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
self.timeline_get_throttle.throttle(ctx, 1).await;
self.get_impl(key, lsn, ctx).await
}
/// Not subject to [`Self::timeline_get_throttle`].
async fn get_impl(
&self,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
if !lsn.is_valid() {
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
@@ -802,7 +763,13 @@ impl Timeline {
// page_service.
debug_assert!(!self.shard_identity.is_key_disposable(&key));
self.timeline_get_throttle.throttle(ctx, 1).await;
// XXX: structured stats collection for layer eviction here.
trace!(
"get page request for {}@{} from task kind {:?}",
key,
lsn,
ctx.task_kind()
);
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
// The cached image can be returned directly if there is no WAL between the cached image
@@ -825,85 +792,12 @@ impl Timeline {
None => None,
};
match self.conf.get_impl {
GetImpl::Legacy => {
let reconstruct_state = ValueReconstructState {
records: Vec::new(),
img: cached_page_img,
};
let mut reconstruct_state = ValueReconstructState {
records: Vec::new(),
img: cached_page_img,
};
self.get_impl(key, lsn, reconstruct_state, ctx).await
}
GetImpl::Vectored => {
let keyspace = KeySpace {
ranges: vec![key..key.next()],
};
// Initialise the reconstruct state for the key with the cache
// entry returned above.
let mut reconstruct_state = ValuesReconstructState::new();
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
.await;
if self.conf.validate_vectored_get {
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
.await;
}
let key_value = vectored_res?.pop_first();
match key_value {
Some((got_key, value)) => {
if got_key != key {
error!(
"Expected {}, but singular vectored get returned {}",
key, got_key
);
Err(PageReconstructError::Other(anyhow!(
"Singular vectored get returned wrong key"
)))
} else {
value
}
}
None => {
error!(
"Expected {}, but singular vectored get returned nothing",
key
);
Err(PageReconstructError::Other(anyhow!(
"Singular vectored get did not return a value for {}",
key
)))
}
}
}
}
}
/// Not subject to [`Self::timeline_get_throttle`].
async fn get_impl(
&self,
key: Key,
lsn: Lsn,
mut reconstruct_state: ValueReconstructState,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
// XXX: structured stats collection for layer eviction here.
trace!(
"get page request for {}@{} from task kind {:?}",
key,
lsn,
ctx.task_kind()
);
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(GetKind::Singular)
.start_timer();
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
let path = self
.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
@@ -913,7 +807,7 @@ impl Timeline {
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
let elapsed = start.elapsed();
crate::metrics::RECONSTRUCT_TIME
.for_get_kind(GetKind::Singular)
.for_result(&res)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing") && res.is_err() {
@@ -955,7 +849,7 @@ impl Timeline {
return Err(GetVectoredError::InvalidLsn(lsn));
}
let key_count = keyspace.total_raw_size().try_into().unwrap();
let key_count = keyspace.total_size().try_into().unwrap();
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
return Err(GetVectoredError::Oversized(key_count));
}
@@ -992,9 +886,7 @@ impl Timeline {
self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
}
GetVectoredImpl::Vectored => {
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
.await;
let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;
if self.conf.validate_vectored_get {
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
@@ -1040,9 +932,7 @@ impl Timeline {
for range in keyspace.ranges {
let mut key = range.start;
while key != range.end {
let block = self
.get_impl(key, lsn, ValueReconstructState::default(), ctx)
.await;
let block = self.get_impl(key, lsn, ctx).await;
use PageReconstructError::*;
match block {
@@ -1060,23 +950,6 @@ impl Timeline {
// level error.
return Err(GetVectoredError::MissingKey(key));
}
Err(Other(err))
if err
.to_string()
.contains("downloading evicted layer file failed") =>
{
return Err(GetVectoredError::Other(err))
}
Err(Other(err))
if err
.chain()
.any(|cause| cause.to_string().contains("layer loading failed")) =>
{
// The intent here is to achieve error parity with the vectored read path.
// When vectored read fails to load a layer it fails the whole read, hence
// we mimic this behaviour here to keep the validation happy.
return Err(GetVectoredError::Other(err));
}
_ => {
values.insert(key, block);
key = key.next();
@@ -1092,25 +965,13 @@ impl Timeline {
&self,
keyspace: KeySpace,
lsn: Lsn,
mut reconstruct_state: ValuesReconstructState,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let get_kind = if keyspace.total_raw_size() == 1 {
GetKind::Singular
} else {
GetKind::Vectored
};
let mut reconstruct_state = ValuesReconstructState::new();
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(get_kind)
.start_timer();
self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
.await?;
get_data_timer.stop_and_record();
let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
.for_get_kind(get_kind)
.start_timer();
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
let layers_visited = reconstruct_state.get_layers_visited();
for (key, res) in reconstruct_state.keys {
@@ -1126,7 +987,6 @@ impl Timeline {
}
}
}
reconstruct_timer.stop_and_record();
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
@@ -1168,11 +1028,6 @@ impl Timeline {
panic!(concat!("Sequential get failed with {}, but vectored get did not",
" - keyspace={:?} lsn={}"),
seq_err, keyspace, lsn) },
(Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
// Sequential get runs after vectored get, so it is possible for the latter
// to time out while waiting for its ancestor's Lsn to become ready and for the
// former to succeed (it essentially has a doubled wait time).
},
(Ok(_), Err(vec_err)) => {
panic!(concat!("Vectored get failed with {}, but sequential get did not",
" - keyspace={:?} lsn={}"),
@@ -1253,12 +1108,6 @@ impl Timeline {
self.last_record_lsn.load()
}
/// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
/// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
///
/// The receiver is obtained from `last_record_lsn.status_receiver()`; this method itself
/// does not wait for anything.
pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
self.last_record_lsn.status_receiver()
}
pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn.load()
}
@@ -1901,15 +1750,6 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
/// Returns the effective `switch_to_aux_file_v2` setting for this timeline: the
/// tenant-level value when one is set, otherwise the default from the pageserver
/// configuration.
#[allow(dead_code)]
pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
    let loaded = self.tenant_conf.load();
    let tenant_override = loaded.tenant_conf.switch_to_aux_file_v2;
    let fallback = self.conf.default_tenant_conf.switch_to_aux_file_v2;
    tenant_override.unwrap_or(fallback)
}
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2129,10 +1969,7 @@ impl Timeline {
// initial logical size is 0.
LogicalSize::empty_initial()
},
partitioning: tokio::sync::Mutex::new((
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
Lsn(0),
)),
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
@@ -2948,7 +2785,7 @@ trait TraversalLayerExt {
impl TraversalLayerExt for Layer {
fn traversal_id(&self) -> TraversalId {
Arc::clone(self.debug_str())
Arc::clone(self.local_path_str())
}
}
@@ -3134,6 +2971,7 @@ impl Timeline {
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = guard.get_from_desc(&layer);
drop(guard);
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3234,7 +3072,7 @@ impl Timeline {
}
}
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
break;
}
@@ -3247,14 +3085,14 @@ impl Timeline {
timeline = &*timeline_owned;
}
if keyspace.total_raw_size() != 0 {
if keyspace.total_size() != 0 {
return Err(GetVectoredError::MissingKey(keyspace.start().unwrap()));
}
Ok(())
}
/// Collect the reconstruct data for a keyspace from the specified timeline.
/// Collect the reconstruct data for a keyspace from the specified timeline.
///
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
/// the current keyspace. The current keyspace of the search at any given timeline
@@ -3289,62 +3127,56 @@ impl Timeline {
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
completed_keyspace.merge(&keys_done_last_step);
// Do not descend any further if the last layer we visited
// completed all keys in the keyspace it inspected. This is not
// required for correctness, but avoids visiting extra layers
// which turns out to be a perf bottleneck in some cases.
if !unmapped_keyspace.is_empty() {
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
});
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
});
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayer::InMemoryLayer(l),
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
for range in unmapped_keyspace.ranges.iter() {
let results = layers.range_search(range.clone(), cont_lsn);
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayer::InMemoryLayer(l),
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
for range in unmapped_keyspace.ranges.iter() {
let results = layers.range_search(range.clone(), cont_lsn);
results
.found
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
results
.found
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
}
// It's safe to drop the layer map lock after planning the next round of reads.
// The fringe keeps readable handles for the layers which are safe to read even
// if layers were compacted or flushed.
//
// The more interesting consideration is: "Why is the read algorithm still correct
// if the layer map changes while it is operating?". Doing a vectored read on a
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
// covered by the read. The layer map tells us how to move the lsn downwards for a
// range at *a particular point in time*. It is fine for the answer to be different
// at two different time points.
drop(guard);
}
// It's safe to drop the layer map lock after planning the next round of reads.
// The fringe keeps readable handles for the layers which are safe to read even
// if layers were compacted or flushed.
//
// The more interesting consideration is: "Why is the read algorithm still correct
// if the layer map changes while it is operating?". Doing a vectored read on a
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
// covered by the read. The layer map tells us how to move the lsn downwards for a
// range at *a particular point in time*. It is fine for the answer to be different
// at two different time points.
drop(guard);
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
let next_cont_lsn = lsn_range.start;
layer_to_read
@@ -3683,103 +3515,66 @@ impl Timeline {
// files instead. This is possible as long as *all* the data imported into the
// repository have the same LSN.
let lsn_range = frozen_layer.get_lsn_range();
// Whether to directly create image layers for this flush, or flush them as delta layers
let create_image_layer =
lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);
#[cfg(test)]
{
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
..
} => {
if create_image_layer {
let (layers_to_upload, delta_layer_to_add) =
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
initdb_optimization_count,
..
} => {
*initdb_optimization_count += 1;
} else {
}
}
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let (partitioning, _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
)
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
// For image layers, we add them immediately into the layer map.
(
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?,
None,
)
} else {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
..
} => {
assert!(!*expect_initdb_optimization, "expected initdb optimization");
}
}
}
}
let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let ((rel_partition, metadata_partition), _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
// For metadata, always create delta layers.
let delta_layer = if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single aux file keyspace"
);
let metadata_keyspace = &metadata_partition.parts[0];
assert_eq!(
metadata_keyspace.0.ranges.len(),
1,
"aux file keyspace should be a single range"
);
self.create_delta_layer(
&frozen_layer,
ctx,
Some(metadata_keyspace.0.ranges[0].clone()),
)
.await?
} else {
None
};
// For image layers, we add them immediately into the layer map.
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
&rel_partition,
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
if let Some(delta_layer) = delta_layer {
layers_to_upload.push(delta_layer.clone());
(layers_to_upload, Some(delta_layer))
} else {
(layers_to_upload, None)
}
} else {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
};
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
if self.cancel.is_cancelled() {
@@ -3899,18 +3694,12 @@ impl Timeline {
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> anyhow::Result<Option<ResidentLayer>> {
) -> anyhow::Result<ResidentLayer> {
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
let ctx = ctx.attached_child();
let work = async move {
let Some(new_delta) = frozen_layer
.write_to_disk(&self_clone, &ctx, key_range)
.await?
else {
return Ok(None);
};
let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
@@ -3929,7 +3718,7 @@ impl Timeline {
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
anyhow::Ok(Some(new_delta))
anyhow::Ok(new_delta)
};
// Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
// Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
@@ -3956,20 +3745,19 @@ impl Timeline {
partition_size: u64,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
anyhow::bail!("repartition() called concurrently, this should not happen");
};
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
if lsn < *partition_lsn {
if lsn < partitioning_guard.1 {
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
}
let distance = lsn.0 - partition_lsn.0;
if *partition_lsn != Lsn(0)
let distance = lsn.0 - partitioning_guard.1 .0;
if partitioning_guard.1 != Lsn(0)
&& distance <= self.repartition_threshold
&& !flags.contains(CompactFlags::ForceRepartition)
{
@@ -3978,24 +3766,37 @@ impl Timeline {
threshold = self.repartition_threshold,
"no repartitioning needed"
);
return Ok((
(dense_partition.clone(), sparse_partition.clone()),
*partition_lsn,
));
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
}
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
}; // no partitioning for metadata keys for now
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
let keyspace = self.collect_keyspace(lsn, ctx).await?;
let partitioning = keyspace.partition(partition_size);
*partitioning_guard = (partitioning, lsn);
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let last = self.last_image_layer_creation_check_at.load();
if lsn != Lsn(0) {
let distance = lsn
.checked_sub(last)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting below if we've not ingested
// sufficient WAL since the last check.
if distance.0 < min_distance {
return false;
}
}
self.last_image_layer_creation_check_at.store(lsn);
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;
@@ -4045,12 +3846,12 @@ impl Timeline {
false
}
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
#[tracing::instrument(skip_all, fields(%lsn, %force))]
async fn create_image_layers(
self: &Arc<Timeline>,
partitioning: &KeyPartitioning,
lsn: Lsn,
mode: ImageLayerCreationMode,
force: bool,
ctx: &RequestContext,
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
@@ -4067,46 +3868,11 @@ impl Timeline {
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
let mut start = Key::MIN;
let check_for_image_layers = {
let last_checks_at = self.last_image_layer_creation_check_at.load();
let distance = lsn
.checked_sub(last_checks_at)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting if this timeline has not ingested sufficient
// WAL since the last check.
distance.0 >= min_distance
};
if check_for_image_layers {
self.last_image_layer_creation_check_at.store(lsn);
}
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;
if partition.overlaps(&Key::metadata_key_range()) {
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
// rather big change. Keep this patch small for now.
match mode {
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
// skip image layer creation anyways for metadata keys.
start = img_range.end;
continue;
}
ImageLayerCreationMode::Initial => {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
}
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
start = img_range.end;
continue;
}
if !force && !self.time_for_new_image_layer(partition, lsn).await {
start = img_range.end;
continue;
}
let mut image_layer_writer = ImageLayerWriter::new(
@@ -4147,7 +3913,7 @@ impl Timeline {
key = key.next();
// Maybe flush `key_request_accum`
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
|| last_key_in_range
{
let results = self
@@ -4433,12 +4199,6 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let _timer = self
.metrics
.update_gc_info_histo
.start_timer()
.record_on_drop();
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
//
// Some unit tests depend on garbage-collection working even when

View File

@@ -9,13 +9,13 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -102,7 +102,7 @@ impl Timeline {
)
.await
{
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
Ok((partitioning, lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,37 +115,17 @@ impl Timeline {
// 3. Create new image layers for partitions that have been modified
// "enough".
let dense_layers = self
let layers = self
.create_image_layers(
&dense_partitioning,
&partitioning,
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
flags.contains(CompactFlags::ForceImageLayerCreation),
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
// For now, nothing will be produced...
let sparse_layers = self
.create_image_layers(
&sparse_partitioning.clone().into_dense(),
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
self.upload_new_image_layers(layers)?;
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -778,9 +758,8 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
@@ -852,10 +831,6 @@ impl CompactionJobExecutor for TimelineAdaptor {
type RequestContext = crate::context::RequestContext;
/// Returns the shard identity by delegating to `get_shard_identity()` on the
/// adaptor's `timeline` field.
fn get_shard_identity(&self) -> &ShardIdentity {
self.timeline.get_shard_identity()
}
async fn get_layers(
&mut self,
key_range: &Range<Key>,

View File

@@ -188,10 +188,24 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
);
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
.await?;
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
match self
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
{
ControlFlow::Break(()) => return ControlFlow::Break(()),
ControlFlow::Continue(()) => (),
}
#[derive(Debug, Default)]
struct EvictionStats {
@@ -316,27 +330,19 @@ impl Timeline {
gate: &GateGuard,
ctx: &RequestContext,
) -> ControlFlow<()> {
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
.await
}
async fn acquire_imitation_permit(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
);
tokio::select! {
permit = acquire_permit => ControlFlow::Continue(permit),
_ = cancel.cancelled() => ControlFlow::Break(()),
_ = self.cancel.cancelled() => ControlFlow::Break(()),
}
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
}
/// If we evict layers but keep cached values derived from those layers, then
@@ -370,7 +376,6 @@ impl Timeline {
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
permit: tokio::sync::SemaphorePermit<'static>,
ctx: &RequestContext,
) -> ControlFlow<()> {
if !self.tenant_shard_id.is_shard_zero() {
@@ -403,28 +408,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let (mut state, _permit) = {
if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
(locked, permit)
} else {
// we might need to wait for a long time here in case of pathological synthetic
// size calculation performance
drop(permit);
let locked = tokio::select! {
locked = tenant.eviction_task_tenant_state.lock() => locked,
_ = self.cancel.cancelled() => {
return ControlFlow::Break(())
},
_ = cancel.cancelled() => {
return ControlFlow::Break(())
}
};
// then reacquire -- this will be bad if there is a lot of traffic, but because we
// released the permit, the overall latency will be much better.
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
(locked, permit)
}
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {

View File

@@ -22,12 +22,10 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::proto::{
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
SubscribeByFilterRequest, TypeSubscription, TypedMessage,
};
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -91,14 +89,6 @@ pub(super) async fn connection_manager_loop_step(
.timeline
.subscribe_for_state_updates();
let mut wait_lsn_status = connection_manager_state
.timeline
.subscribe_for_wait_lsn_updates();
// TODO: create a separate config option for discovery request interval
let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
let mut last_discovery_ts: Option<std::time::Instant> = None;
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
@@ -107,12 +97,10 @@ pub(super) async fn connection_manager_loop_step(
loop {
let time_until_next_retry = connection_manager_state.time_until_next_retry();
let any_activity = connection_manager_state.wal_connection.is_some()
|| !connection_manager_state.wal_stream_candidates.is_empty();
// These things are happening concurrently:
//
// - cancellation request
// - cancellation request
// - keep receiving WAL on the current connection
// - if the shared state says we need to change connection, disconnect and return
// - this runs in a separate task and we receive updates via a watch channel
@@ -120,7 +108,6 @@ pub(super) async fn connection_manager_loop_step(
// - receive updates from broker
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
// - if there's no connection and no candidates, try to send a discovery request
// NB: make sure each of the select expressions are cancellation-safe
// (no need for arms to be cancellation-safe).
@@ -227,65 +214,6 @@ pub(super) async fn connection_manager_loop_step(
}
}
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
Some(()) = async {
// Reminder: this match arm needs to be cancellation-safe.
// Calculating time needed to wait until sending the next discovery request.
// Current implementation is conservative and sends discovery requests only when there are no candidates.
if any_activity {
// No need to send discovery requests if there is an active connection or candidates.
return None;
}
// Waiting for an active wait_lsn request.
while wait_lsn_status.borrow().is_none() {
if wait_lsn_status.changed().await.is_err() {
// wait_lsn_status channel was closed, exiting
warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
return None;
}
}
// All preconditions met, preparing to send a discovery request.
let now = std::time::Instant::now();
let next_discovery_ts = last_discovery_ts
.map(|ts| ts + discovery_request_interval)
.unwrap_or_else(|| now);
if next_discovery_ts > now {
// Prevent sending discovery requests too frequently.
tokio::time::sleep(next_discovery_ts - now).await;
}
let tenant_timeline_id = Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
let msg = TypedMessage {
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: Some(request),
safekeeper_discovery_response: None,
};
last_discovery_ts = Some(std::time::Instant::now());
debug!("No active connection and no candidates, sending discovery request to the broker");
// Cancellation safety: we want to send a message to the broker, but publish_one()
// function can get cancelled by the other select! arm. This is absolutely fine, because
// we just want to receive broker updates and discovery is not important if we already
// receive updates.
//
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
// This is totally fine because of the reason above.
// This is a fire-and-forget request, we don't care about the response
let _ = broker_client.publish_one(msg).await;
debug!("Discovery request sent to the broker");
None
} => {}
}
if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -303,7 +231,7 @@ async fn subscribe_for_timeline_updates(
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
cancel: &CancellationToken,
) -> Result<Streaming<TypedMessage>, Cancelled> {
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
let mut attempt = 0;
loop {
exponential_backoff(
@@ -316,27 +244,17 @@ async fn subscribe_for_timeline_updates(
attempt += 1;
// subscribe to the specific timeline
let request = SubscribeByFilterRequest {
types: vec![
TypeSubscription {
r#type: MessageType::SafekeeperTimelineInfo as i32,
},
TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
},
],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: true,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
}),
}),
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SubscribeSafekeeperInfoRequest {
subscription_key: Some(key),
};
match {
tokio::select! {
r = broker_client.subscribe_by_filter(request) => { r }
r = broker_client.subscribe_safekeeper_info(request) => { r }
_ = cancel.cancelled() => { return Err(Cancelled); }
}
} {
@@ -480,7 +398,7 @@ struct RetryInfo {
/// Data about the timeline to connect to, received from the broker.
#[derive(Debug, Clone)]
struct BrokerSkTimeline {
timeline: SafekeeperDiscoveryResponse,
timeline: SafekeeperTimelineInfo,
/// Time at which the data was fetched from the broker last time, to track the stale data.
latest_update: NaiveDateTime,
}
@@ -688,41 +606,7 @@ impl ConnectionManagerState {
}
/// Adds another broker timeline into the state, if it's more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
let mut is_discovery = false;
let timeline_update = match typed_msg.r#type() {
MessageType::SafekeeperTimelineInfo => {
let info = match typed_msg.safekeeper_timeline_info {
Some(info) => info,
None => {
warn!("bad proto message from broker: no safekeeper_timeline_info");
return;
}
};
SafekeeperDiscoveryResponse {
safekeeper_id: info.safekeeper_id,
tenant_timeline_id: info.tenant_timeline_id,
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
}
}
MessageType::SafekeeperDiscoveryResponse => {
is_discovery = true;
match typed_msg.safekeeper_discovery_response {
Some(response) => response,
None => {
warn!("bad proto message from broker: no safekeeper_discovery_response");
return;
}
}
}
_ => {
// unexpected message
return;
}
};
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
WALRECEIVER_BROKER_UPDATES.inc();
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -735,11 +619,7 @@ impl ConnectionManagerState {
);
if old_entry.is_none() {
info!(
?is_discovery,
%new_safekeeper_id,
"New SK node was added",
);
info!("New SK node was added: {new_safekeeper_id}");
WALRECEIVER_CANDIDATES_ADDED.inc();
}
}
@@ -938,7 +818,7 @@ impl ConnectionManagerState {
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
self.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.max_by_key(|(_, info, _)| info.commit_lsn)
@@ -948,7 +828,7 @@ impl ConnectionManagerState {
/// Some safekeepers are filtered by the retry cooldown.
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
let now = Utc::now().naive_utc();
self.wal_stream_candidates
@@ -1088,11 +968,19 @@ mod tests {
latest_update: NaiveDateTime,
) -> BrokerSkTimeline {
BrokerSkTimeline {
timeline: SafekeeperDiscoveryResponse {
timeline: SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: None,
term: 0,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
backup_lsn: 0,
remote_consistent_lsn: 0,
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
http_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
},
latest_update,

View File

@@ -32,11 +32,11 @@ pub use io_engine::feature_test as io_engine_feature_test;
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
mod metadata;
mod open_options;
use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata;
pub(crate) use open_options::*;
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
pub(crate) mod owned_buffers_io {
//! Abstractions for IO with owned buffers.
//!
@@ -1083,17 +1083,6 @@ impl Drop for VirtualFile {
}
}
/// Lets a [`VirtualFile`] be used wherever an [`OwnedAsyncWriter`] is expected.
impl OwnedAsyncWriter for VirtualFile {
/// Forwards the write to `VirtualFile::write_all` and reshapes its
/// `(buffer, result)` pair into the `Result<(value, buffer)>` required by the trait.
///
/// On failure only the `std::io::Error` is propagated; the buffer is dropped
/// together with the unused closure.
#[inline(always)]
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
// The callee hands the buffer back alongside the result, whatever the outcome.
let (buf, res) = VirtualFile::write_all(self, buf).await;
// Pair the success value with the returned buffer.
res.map(move |v| (v, buf))
}
}
impl OpenFiles {
fn new(num_slots: usize) -> OpenFiles {
let mut slots = Box::new(Vec::with_capacity(num_slots));

View File

@@ -1,45 +1,33 @@
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
use tokio_epoll_uring::{BoundedBuf, IoBuf};
pub struct Writer<W> {
dst: W,
pub struct Writer {
dst: VirtualFile,
bytes_amount: u64,
}
impl<W> Writer<W> {
pub fn new(dst: W) -> Self {
impl Writer {
pub fn new(dst: VirtualFile) -> Self {
Self {
dst,
bytes_amount: 0,
}
}
pub fn bytes_written(&self) -> u64 {
self.bytes_amount
}
pub fn as_inner(&self) -> &W {
&self.dst
}
/// Returns the wrapped `VirtualFile` object as well as the number
/// of bytes that were written to it through this object.
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub fn into_inner(self) -> (u64, W) {
pub fn into_inner(self) -> (u64, VirtualFile) {
(self.bytes_amount, self.dst)
}
}
impl<W> OwnedAsyncWriter for Writer<W>
where
W: OwnedAsyncWriter,
{
impl OwnedAsyncWriter for Writer {
#[inline(always)]
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let (nwritten, buf) = self.dst.write_all(buf).await?;
let (buf, res) = self.dst.write_all(buf).await;
let nwritten = res?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok((nwritten, buf))
}

View File

@@ -10,14 +10,14 @@ pub trait OwnedAsyncWriter {
) -> std::io::Result<(usize, B::Buf)>;
}
/// A wrapper around an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
/// small writes into larger writes of size [`Buffer::cap`].
/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
/// into `BUFFER_SIZE`-sized writes.
///
/// # Passthrough Of Large Writes
///
/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
/// cause the internal buffer to be flushed prematurely so that the large
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
/// buffer to be flushed, even if it is not full yet. Then, the large
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
///
/// This pass-through is generally beneficial for throughput, but if
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
@@ -25,38 +25,27 @@ pub trait OwnedAsyncWriter {
///
/// In such cases, a different implementation that always buffers in memory
/// may be preferable.
pub struct BufferedWriter<B, W> {
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
writer: W,
/// invariant: always remains Some(buf) except
/// - while IO is ongoing => goes back to Some() once the IO completed successfully
/// - after an IO error => stays `None` forever
/// In these exceptional cases, it's `None`.
buf: Option<B>,
// invariant: always remains Some(buf)
// with buf.capacity() == BUFFER_SIZE except
// - while IO is ongoing => goes back to Some() once the IO completed successfully
// - after an IO error => stays `None` forever
// In these exceptional cases, it's `None`.
buf: Option<BytesMut>,
}
impl<B, Buf, W> BufferedWriter<B, W>
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
where
B: Buffer<IoBuf = Buf> + Send,
Buf: IoBuf + Send,
W: OwnedAsyncWriter,
{
pub fn new(writer: W, buf: B) -> Self {
pub fn new(writer: W) -> Self {
Self {
writer,
buf: Some(buf),
buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
}
}
pub fn as_inner(&self) -> &W {
&self.writer
}
/// Panics if used after any of the write paths returned an error
pub fn inspect_buffer(&self) -> &B {
self.buf()
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
self.flush().await?;
let Self { buf, writer } = self;
@@ -64,144 +53,61 @@ where
Ok(writer)
}
#[inline(always)]
fn buf(&self) -> &B {
self.buf
.as_ref()
.expect("must not use after we returned an error")
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
where
S: IoBuf + Send,
B: IoBuf + Send,
{
let chunk_len = chunk.len();
// avoid memcpy for the middle of the chunk
if chunk.len() >= self.buf().cap() {
if chunk.len() >= BUFFER_SIZE {
self.flush().await?;
// do a big write, bypassing `buf`
assert_eq!(
self.buf
.as_ref()
.expect("must not use after an error")
.pending(),
.len(),
0
);
let chunk_len = chunk.len();
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
assert_eq!(nwritten, chunk_len);
return Ok((nwritten, chunk));
drop(chunk);
return Ok(());
}
// in-memory copy the < BUFFER_SIZED tail of the chunk
assert!(chunk.len() < self.buf().cap());
let mut slice = &chunk[..];
while !slice.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let have = slice.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&slice[..n]);
slice = &slice[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush().await?;
}
}
assert!(slice.is_empty(), "by now we should have drained the chunk");
Ok((chunk_len, chunk.into_inner()))
}
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
///
/// It is less performant because we always have to copy the borrowed data into the internal buffer
/// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
/// for large writes.
pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
let chunk_len = chunk.len();
assert!(chunk.len() < BUFFER_SIZE);
let mut chunk = &chunk[..];
while !chunk.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let need = BUFFER_SIZE - buf.len();
let have = chunk.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&chunk[..n]);
chunk = &chunk[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
if buf.len() >= BUFFER_SIZE {
assert_eq!(buf.len(), BUFFER_SIZE);
self.flush().await?;
}
}
Ok(chunk_len)
assert!(chunk.is_empty(), "by now we should have drained the chunk");
Ok(())
}
async fn flush(&mut self) -> std::io::Result<()> {
let buf = self.buf.take().expect("must not use after an error");
let buf_len = buf.pending();
if buf_len == 0 {
if buf.is_empty() {
self.buf = Some(buf);
return Ok(());
return std::io::Result::Ok(());
}
let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
let buf_len = buf.len();
let (nwritten, mut buf) = self.writer.write_all(buf).await?;
assert_eq!(nwritten, buf_len);
self.buf = Some(Buffer::reuse_after_flush(io_buf));
buf.clear();
self.buf = Some(buf);
Ok(())
}
}
/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones.
pub trait Buffer {
/// The owned IO buffer type that [`Self::flush`] wraps in a `Slice` and that
/// [`Self::reuse_after_flush`] takes back once the write has completed.
type IoBuf: IoBuf;
/// Capacity of the buffer. Must not change over the lifetime of `self`.
fn cap(&self) -> usize;
/// Add data to the buffer.
/// Panics if there is not enough room to accommodate `other`'s content, i.e.,
/// panics if `other.len() > self.cap() - self.pending()`.
fn extend_from_slice(&mut self, other: &[u8]);
/// Number of bytes in the buffer.
fn pending(&self) -> usize;
/// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data
/// so we can use [`tokio_epoll_uring`] to write it to disk.
fn flush(self) -> Slice<Self::IoBuf>;
/// After the write to disk is done and we have gotten back the slice,
/// [`BufferedWriter`] uses this method to re-use the io buffer.
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
}
/// [`Buffer`] implementation backed by a [`BytesMut`].
impl Buffer for BytesMut {
type IoBuf = BytesMut;
/// Reports the `BytesMut`'s current `capacity()`.
#[inline(always)]
fn cap(&self) -> usize {
self.capacity()
}
/// Delegates to the inherent `BytesMut::extend_from_slice`.
// NOTE(review): the trait documents a panic when `other` does not fit; whether
// `BytesMut::extend_from_slice` panics or grows the allocation is not visible
// here -- confirm that callers never exceed `cap() - pending()`.
fn extend_from_slice(&mut self, other: &[u8]) {
BytesMut::extend_from_slice(self, other)
}
/// Reports the number of bytes currently stored, i.e. `len()`.
#[inline(always)]
fn pending(&self) -> usize {
self.len()
}
/// Wraps the stored bytes in a `Slice` covering `0..len`; an empty buffer is
/// returned via `slice_full()` instead.
fn flush(self) -> Slice<BytesMut> {
if self.is_empty() {
// NOTE(review): presumably avoids building an empty `0..0` slice -- confirm.
return self.slice_full();
}
let len = self.len();
self.slice(0..len)
}
/// Clears the returned IO buffer and hands it back for the next round of buffering.
fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
iobuf.clear();
iobuf
}
}
impl OwnedAsyncWriter for Vec<u8> {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
@@ -219,8 +125,6 @@ impl OwnedAsyncWriter for Vec<u8> {
#[cfg(test)]
mod tests {
use bytes::BytesMut;
use super::*;
#[derive(Default)]
@@ -254,7 +158,7 @@ mod tests {
#[tokio::test]
async fn test_buffered_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"a");
write!(writer, b"b");
write!(writer, b"c");
@@ -271,7 +175,7 @@ mod tests {
#[tokio::test]
async fn test_passthrough_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"abc");
write!(writer, b"de");
write!(writer, b"");
@@ -287,7 +191,7 @@ mod tests {
#[tokio::test]
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"a");
write!(writer, b"bc");
write!(writer, b"d");
@@ -299,31 +203,4 @@ mod tests {
);
Ok(())
}
/// With a 2-byte buffer, every `write_buffered_borrowed` call is copied into the
/// buffer regardless of its size (1 to 5 bytes here), so the recorder must only
/// ever observe 2-byte writes plus the 1-byte remainder.
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
// Mix of writes larger than, equal to, and smaller than the buffer capacity.
writer.write_buffered_borrowed(b"abc").await?;
writer.write_buffered_borrowed(b"d").await?;
writer.write_buffered_borrowed(b"e").await?;
writer.write_buffered_borrowed(b"fg").await?;
writer.write_buffered_borrowed(b"hi").await?;
writer.write_buffered_borrowed(b"j").await?;
writer.write_buffered_borrowed(b"klmno").await?;
// Flush whatever is still buffered before inspecting the recorded writes.
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
{
// The input bytes "abcdefghijklmno" regrouped into buffer-sized chunks.
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
expect
}
.iter()
.map(|v| v[..].to_vec())
.collect::<Vec<_>>()
);
Ok(())
}
}

View File

@@ -1034,7 +1034,7 @@ impl WalIngest {
let nblocks = modification
.tline
.get_rel_size(src_rel, Version::Modified(modification), ctx)
.get_rel_size(src_rel, Version::Modified(modification), true, ctx)
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
@@ -1068,7 +1068,13 @@ impl WalIngest {
let content = modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx)
.get_rel_page_at_lsn(
src_rel,
blknum,
Version::Modified(modification),
true,
ctx,
)
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
@@ -1236,7 +1242,7 @@ impl WalIngest {
};
if modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
self.put_rel_drop(modification, rel, ctx).await?;
@@ -1535,7 +1541,7 @@ impl WalIngest {
nblocks
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1547,7 +1553,7 @@ impl WalIngest {
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
@@ -1644,14 +1650,14 @@ async fn get_relsize(
) -> anyhow::Result<BlockNumber> {
let nblocks = if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
Ok(nblocks)
@@ -1726,29 +1732,29 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
3
);
@@ -1756,46 +1762,46 @@ mod tests {
// Check page contents at each LSN
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
test_img("foo blk 0 at 2")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
test_img("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 2 at 5")
);
@@ -1811,19 +1817,19 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
test_img("foo blk 1 at 4")
);
@@ -1831,13 +1837,13 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
3
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 2 at 5")
);
@@ -1850,7 +1856,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
.await?,
0
);
@@ -1863,19 +1869,19 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
ZERO_PAGE
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
test_img("foo blk 1")
);
@@ -1888,21 +1894,21 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
1501
);
for blk in 2..1500 {
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
ZERO_PAGE
);
}
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
test_img("foo blk 1500")
);
@@ -1929,13 +1935,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
1
);
@@ -1948,7 +1954,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
.await?,
false
);
@@ -1966,13 +1972,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
1
);
@@ -2005,24 +2011,24 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
relsize
);
@@ -2033,7 +2039,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
.await?,
test_img(&data)
);
@@ -2050,7 +2056,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
1
);
@@ -2060,7 +2066,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
test_img(&data)
);
@@ -2069,7 +2075,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
relsize
);
@@ -2078,7 +2084,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img(&data)
);
@@ -2098,13 +2104,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
relsize
);
@@ -2114,7 +2120,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
test_img(&data)
);
@@ -2148,7 +2154,7 @@ mod tests {
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE + 1
);
@@ -2162,7 +2168,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE
);
@@ -2177,7 +2183,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE - 1
);
@@ -2195,7 +2201,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
size as BlockNumber
);

View File

@@ -49,8 +49,6 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 1;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
static int stripe_size;
@@ -381,17 +379,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
pfree(msg);
return false;
}
switch (neon_protocol_version)
{
case 2:
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
break;
case 1:
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
break;
default:
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
}
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
ret = PQsendQuery(conn, query);
pfree(query);
if (ret != 1)
@@ -452,7 +440,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
return false;
}
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
page_servers[shard_no].conn = conn;
page_servers[shard_no].wes = wes;
@@ -856,16 +844,6 @@ pg_init_libpagestore(void)
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
DefineCustomIntVariable("neon.protocol_version",
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,
0, /* no flags required */
NULL, NULL, NULL);
relsize_hash_init();

View File

@@ -69,33 +69,18 @@ typedef enum {
SLRU_MULTIXACT_OFFSETS
} SlruKind;
/*--
* supertype of all the Neon*Request structs below.
/*
* supertype of all the Neon*Request structs below
*
* All requests contain two LSNs:
*
* lsn: request page (or relation size, etc) at this LSN
* not_modified_since: Hint that the page hasn't been modified between
* this LSN and the request LSN (`lsn`).
*
* To request the latest version of a page, you can use MAX_LSN as the request
* LSN.
*
* If you don't know any better, you can always set 'not_modified_since' equal
* to 'lsn', but providing a lower value can speed up processing the request
* in the pageserver, as it doesn't need to wait for the WAL to arrive, and it
* can skip traversing through recent layers which we know to not contain any
* versions for the requested page.
*
* These structs describe the V2 of these requests. The old V1 protocol contained
* just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
* set to 1, we will convert these to the V1 requests before sending.
* If 'latest' is true, we are requesting the latest page version, and 'lsn'
* is just a hint to the server that we know there are no versions of the page
* (or relation size, for exists/nblocks requests) later than the 'lsn'.
*/
typedef struct
{
NeonMessageTag tag;
XLogRecPtr lsn;
XLogRecPtr not_modified_since;
bool latest; /* if true, request latest page version */
XLogRecPtr lsn; /* request page version @ this LSN */
} NeonRequest;
typedef struct
@@ -208,7 +193,6 @@ extern int readahead_buffer_size;
extern char *neon_timeline;
extern char *neon_tenant;
extern int32 max_cluster_size;
extern int neon_protocol_version;
extern shardno_t get_shard_number(BufferTag* tag);
@@ -241,14 +225,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
XLogRecPtr request_lsn, bool request_latest, char *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
#else
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
XLogRecPtr request_lsn, bool request_latest, void *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif

View File

@@ -168,8 +168,8 @@ typedef enum PrefetchStatus
typedef struct PrefetchRequest
{
BufferTag buftag; /* must be first entry in the struct */
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
XLogRecPtr effective_request_lsn;
XLogRecPtr actual_request_lsn;
NeonResponse *response; /* may be null */
PrefetchStatus status;
shardno_t shard_no;
@@ -269,19 +269,19 @@ static PrefetchState *MyPState;
) \
)
static XLogRecPtr prefetch_lsn = 0;
static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
static bool prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup_trailing_unused(void);
static inline void prefetch_set_unused(uint64 ring_index);
static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
PrefetchRequest *slot);
static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
ForkNumber forknum, BlockNumber blkno);
static bool
compact_prefetch_buffers(void)
@@ -338,8 +338,8 @@ compact_prefetch_buffers(void)
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->request_lsn = source_slot->request_lsn;
target_slot->not_modified_since = source_slot->not_modified_since;
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
target_slot->actual_request_lsn = source_slot->actual_request_lsn;
target_slot->my_ring_index = empty_ring_index;
prfh_delete(MyPState->prf_hash, source_slot);
@@ -358,8 +358,7 @@ compact_prefetch_buffers(void)
};
source_slot->response = NULL;
source_slot->my_ring_index = 0;
source_slot->request_lsn = InvalidXLogRecPtr;
source_slot->not_modified_since = InvalidXLogRecPtr;
source_slot->effective_request_lsn = 0;
/* update bookkeeping */
n_moved++;
@@ -684,39 +683,56 @@ prefetch_set_unused(uint64 ring_index)
compact_prefetch_buffers();
}
/*
* Send one prefetch request to the pageserver. To wait for the response, call
* prefetch_wait_for().
*/
static void
prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
{
bool found;
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
/* lsn and not_modified_since are filled in below */
.req.latest = false,
.req.lsn = 0,
.rinfo = BufTagGetNRelFileInfo(slot->buftag),
.forknum = slot->buftag.forkNum,
.blkno = slot->buftag.blockNum,
};
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
if (force_request_lsn)
if (force_lsn && force_latest)
{
request.req.lsn = *force_request_lsn;
request.req.not_modified_since = *force_not_modified_since;
request.req.lsn = *force_lsn;
request.req.latest = *force_latest;
slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn;
}
else
{
neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum,
&request.req.lsn,
&request.req.not_modified_since);
XLogRecPtr lsn = neon_get_request_lsn(
&request.req.latest,
BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum
);
/*
* Note: effective_request_lsn is potentially higher than the
* requested LSN, but still correct:
*
* We know there are no changes between the actual requested LSN and
* the value of effective_request_lsn: If there were, the page would
* have been in cache and evicted between those LSN values, which then
* would have had to result in a larger request LSN for this page.
*
* It is possible that a concurrent backend loads the page, modifies
* it and then evicts it again, but the LSN of that eviction cannot be
* smaller than the current WAL insert/redo pointer, which is already
* larger than this prefetch_lsn. So in any case, that would
* invalidate this cache.
*
* The best LSN to use for effective_request_lsn would be
* XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
*/
slot->actual_request_lsn = request.req.lsn = lsn;
prefetch_lsn = Max(prefetch_lsn, lsn);
slot->effective_request_lsn = prefetch_lsn;
}
slot->request_lsn = request.req.lsn;
slot->not_modified_since = request.req.not_modified_since;
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -733,6 +749,7 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
/* update slot state */
slot->status = PRFS_REQUESTED;
prfh_insert(MyPState->prf_hash, slot, &found);
Assert(!found);
}
@@ -742,25 +759,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
*
* Register that we may want the contents of BufferTag in the near future.
*
* If force_request_lsn and force_not_modified_since are not NULL, those
* values are sent to the pageserver. If they are NULL, we utilize the
* lastWrittenLsn -infrastructure to fill them in.
* If force_latest and force_lsn are not NULL, those values are sent to the
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
* to fill in these values manually.
*
* NOTE: this function may indirectly update MyPState->prf_hash, which
* invalidates any active pointers into the hash table.
*/
static uint64
prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
XLogRecPtr *force_not_modified_since)
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
{
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
Retry:
@@ -778,19 +792,40 @@ Retry:
Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
/*
* If the caller specified a request LSN to use, only accept prefetch
* responses that satisfy that request.
* If we want a specific lsn, we do not accept requests that were made
* with a potentially different LSN.
*/
if (force_request_lsn)
if (force_latest && force_lsn)
{
if (!neon_prefetch_response_usable(*force_request_lsn,
*force_not_modified_since, slot))
/*
* if we want the latest version, any effective_request_lsn >=
* request lsn is OK
*/
if (*force_latest)
{
/* Wait for the old request to finish and discard it */
if (!prefetch_wait_for(ring_index))
goto Retry;
prefetch_set_unused(ring_index);
entry = NULL;
if (*force_lsn > slot->effective_request_lsn)
{
if (!prefetch_wait_for(ring_index))
goto Retry;
prefetch_set_unused(ring_index);
entry = NULL;
}
}
/*
* if we don't want the latest version, only accept requests with
* the exact same LSN
*/
else
{
if (*force_lsn != slot->effective_request_lsn)
{
if (!prefetch_wait_for(ring_index))
goto Retry;
prefetch_set_unused(ring_index);
entry = NULL;
}
}
}
@@ -886,7 +921,7 @@ Retry:
slot->shard_no = get_shard_number(&tag);
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
prefetch_do_request(slot, force_latest, force_lsn);
Assert(slot->status == PRFS_REQUESTED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
@@ -915,7 +950,7 @@ page_server_request(void const *req)
BufferTag tag = {0};
shardno_t shard_no;
switch (messageTag(req))
switch (((NeonRequest *) req)->tag)
{
case T_NeonExistsRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
@@ -931,10 +966,11 @@ page_server_request(void const *req)
tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
break;
default:
neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
}
shard_no = get_shard_number(&tag);
/*
* The current sharding model assumes that all metadata is present only at shard 0.
* We still need to call get_shard_number() to check if the shard map is up-to-date.
@@ -961,52 +997,8 @@ nm_pack_request(NeonRequest *msg)
StringInfoData s;
initStringInfo(&s);
pq_sendbyte(&s, msg->tag);
if (neon_protocol_version >= 2)
{
pq_sendbyte(&s, msg->tag);
pq_sendint64(&s, msg->lsn);
pq_sendint64(&s, msg->not_modified_since);
}
else
{
bool latest;
XLogRecPtr lsn;
/*
* In primary, we always request the latest page version.
*/
if (!RecoveryInProgress())
{
latest = true;
lsn = msg->not_modified_since;
}
else
{
/*
* In the protocol V1, we cannot represent that we want to read
* page at LSN X, and we know that it hasn't been modified since
* Y. We can either use 'not_modified_lsn' as the request LSN, and
* risk getting an error if that LSN is too old and has already
* fallen out of the pageserver's GC horizon, or we can send
* 'request_lsn', causing the pageserver to possibly wait for the
* recent WAL to arrive unnecessarily. Or something in between. We
* choose to use the old LSN and risk GC errors, because that's
* what we've done historically.
*/
latest = false;
lsn = msg->not_modified_since;
}
pq_sendbyte(&s, msg->tag);
pq_sendbyte(&s, latest);
pq_sendint64(&s, lsn);
}
/*
* The rest of the request messages are the same between protocol V1 and
* V2
*/
switch (messageTag(msg))
{
/* pagestore_client -> pagestore */
@@ -1014,6 +1006,8 @@ nm_pack_request(NeonRequest *msg)
{
NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
pq_sendbyte(&s, msg_req->req.latest);
pq_sendint64(&s, msg_req->req.lsn);
pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
@@ -1025,6 +1019,8 @@ nm_pack_request(NeonRequest *msg)
{
NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
pq_sendbyte(&s, msg_req->req.latest);
pq_sendint64(&s, msg_req->req.lsn);
pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
@@ -1036,6 +1032,8 @@ nm_pack_request(NeonRequest *msg)
{
NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
pq_sendbyte(&s, msg_req->req.latest);
pq_sendint64(&s, msg_req->req.lsn);
pq_sendint32(&s, msg_req->dbNode);
break;
@@ -1044,6 +1042,8 @@ nm_pack_request(NeonRequest *msg)
{
NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
pq_sendbyte(&s, msg_req->req.latest);
pq_sendint64(&s, msg_req->req.lsn);
pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
@@ -1057,6 +1057,8 @@ nm_pack_request(NeonRequest *msg)
{
NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
pq_sendbyte(&s, msg_req->req.latest);
pq_sendint64(&s, msg_req->req.lsn);
pq_sendbyte(&s, msg_req->kind);
pq_sendint32(&s, msg_req->segno);
@@ -1207,7 +1209,7 @@ nm_to_string(NeonMessage *msg)
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
appendStringInfoChar(&s, '}');
break;
}
@@ -1220,7 +1222,7 @@ nm_to_string(NeonMessage *msg)
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
appendStringInfoChar(&s, '}');
break;
}
@@ -1234,7 +1236,7 @@ nm_to_string(NeonMessage *msg)
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
appendStringInfoChar(&s, '}');
break;
}
@@ -1245,7 +1247,7 @@ nm_to_string(NeonMessage *msg)
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
appendStringInfoChar(&s, '}');
break;
}
@@ -1257,7 +1259,7 @@ nm_to_string(NeonMessage *msg)
appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
appendStringInfoChar(&s, '}');
break;
}
@@ -1529,38 +1531,44 @@ nm_adjust_lsn(XLogRecPtr lsn)
/*
* Return LSN for requesting pages and number of blocks from page server
*/
static void
neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
static XLogRecPtr
neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
{
XLogRecPtr last_written_lsn;
last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
last_written_lsn = nm_adjust_lsn(last_written_lsn);
Assert(last_written_lsn != InvalidXLogRecPtr);
XLogRecPtr lsn;
if (RecoveryInProgress())
{
/* Request the page at the last replayed LSN. */
*request_lsn = GetXLogReplayRecPtr(NULL);
*not_modified_since = last_written_lsn;
Assert(last_written_lsn <= *request_lsn);
/*
* We don't know if WAL has been generated but not yet replayed, so
* we're conservative in our estimates about latest pages.
*/
*latest = false;
neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
/*
* Get the last written LSN of this page.
*/
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
lsn = nm_adjust_lsn(lsn);
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
}
else
{
XLogRecPtr flushlsn;
/*
* Use the latest LSN that was evicted from the buffer cache as the
* 'not_modified_since' hint. Any pages modified by later WAL records
* must still be in the buffer cache, so our request cannot concern
* those.
* Use the latest LSN that was evicted from the buffer cache. Any
* pages modified by later WAL records must still be in the buffer cache,
* so our request cannot concern those.
*/
*latest = true;
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
Assert(lsn != InvalidXLogRecPtr);
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
LSN_FORMAT_ARGS(last_written_lsn));
(uint32) ((lsn) >> 32), (uint32) (lsn));
lsn = nm_adjust_lsn(lsn);
/*
* Is it possible that the last-written LSN is ahead of last flush
@@ -1575,109 +1583,16 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
#else
flushlsn = GetFlushRecPtr();
#endif
if (last_written_lsn > flushlsn)
if (lsn > flushlsn)
{
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
LSN_FORMAT_ARGS(last_written_lsn),
LSN_FORMAT_ARGS(flushlsn));
XLogFlush(last_written_lsn);
flushlsn = last_written_lsn;
(uint32) (lsn >> 32), (uint32) lsn,
(uint32) (flushlsn >> 32), (uint32) flushlsn);
XLogFlush(lsn);
}
/*
* Request the latest version of the page. The most up-to-date request
* LSN we could use would be the current insert LSN, but to avoid the
* overhead of looking it up, use 'flushlsn' instead. This relies on
* the assumption that if the page was modified since the last WAL
* flush, it should still be in the buffer cache, and we wouldn't be
* requesting it.
*/
*request_lsn = flushlsn;
*not_modified_since = last_written_lsn;
}
}
/*
* neon_prefetch_response_usable -- Can a new request be satisfied by old one?
*
* This is used to check if the response to a prefetch request can be used to
* satisfy a page read now.
*/
static bool
neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
PrefetchRequest *slot)
{
/* sanity check the LSN's on the old and the new request */
Assert(request_lsn >= not_modified_since);
Assert(slot->request_lsn >= slot->not_modified_since);
Assert(slot->status != PRFS_UNUSED);
/*
* The new request's LSN should never be older than the old one. This
* could be an Assert, except that for testing purposes, we do provide an
* interface in neon_test_utils to fetch pages at arbitrary LSNs, which
* violates this.
*
* Similarly, the not_modified_since value calculated for a page should
* never move backwards. This assumption is a bit fragile; if we updated
* the last-written cache when we read in a page, for example, then it
* might. But as the code stands, it should not.
*
* (If two backends issue a request at the same time, they might race and
* calculate LSNs "out of order" with each other, but the prefetch queue
* is backend-private at the moment.)
*/
if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
{
ereport(LOG,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
return false;
}
/*---
* Each request to the pageserver carries two LSN values:
* `not_modified_since` and `request_lsn`. The (not_modified_since,
* request_lsn] range of each request is effectively a claim that the page
* has not been modified between those LSNs. If the range of the old
* request in the queue overlaps with the new request, we know that the
* page hasn't been modified in the union of the ranges. We can use the
* response to old request to satisfy the new request in that case. For
* example:
*
* 100 500
* Old request: +--------+
*
* 400 800
* New request: +--------+
*
* The old request claims that the page was not modified between LSNs 100
* and 500, and the second claims that it was not modified between 400 and
* 800. Together they mean that the page was not modified between 100 and
* 800. Therefore the response to the old request is also valid for the
* new request.
*
* This logic also holds at the boundary case that the old request's LSN
* matches the new request's not_modified_since LSN exactly:
*
* 100 500
* Old request: +--------+
*
* 500 900
* New request: +--------+
*
* The response to the old request is the page as it was at LSN 500, and
* the page hasn't been changed in the range (500, 900], therefore the
* response is valid also for the new request.
*/
/* this follows from the checks above */
Assert(request_lsn >= slot->not_modified_since);
return not_modified_since <= slot->request_lsn;
return lsn;
}
/*
@@ -1689,8 +1604,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
bool exists;
NeonResponse *resp;
BlockNumber n_blocks;
bool latest;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
switch (reln->smgr_relpersistence)
{
@@ -1745,13 +1660,12 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
{
NeonExistsRequest request = {
.req.tag = T_NeonExistsRequest,
.req.latest = latest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forkNum};
@@ -2188,10 +2102,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
void
#if PG_MAJORVERSION_NUM < 16
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
XLogRecPtr request_lsn, bool request_latest, char *buffer)
#else
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
XLogRecPtr request_lsn, bool request_latest, void *buffer)
#endif
{
NeonResponse *resp;
@@ -2234,16 +2148,15 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (entry != NULL)
{
slot = entry->slot;
if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
if (slot->effective_request_lsn >= request_lsn)
{
ring_index = slot->my_ring_index;
pgBufferUsage.prefetch.hits += 1;
}
else
else /* the current prefetch LSN is not large
* enough, so drop the prefetch */
{
/*
* Cannot use this prefetch, discard it
*
* We can't drop cache for not-yet-received requested items. It is
* unlikely this happens, but it can happen if prefetch distance
* is large enough and a backend didn't consume all prefetch
@@ -2268,8 +2181,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
{
pgBufferUsage.prefetch.misses += 1;
ring_index = prefetch_register_buffer(buftag, &request_lsn,
&not_modified_since);
ring_index = prefetch_register_buffer(buftag, &request_latest,
&request_lsn);
slot = GetPrfSlot(ring_index);
}
else
@@ -2333,8 +2246,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
#endif
{
bool latest;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
switch (reln->smgr_relpersistence)
{
@@ -2359,9 +2272,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
}
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
&request_lsn, &not_modified_since);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer);
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2530,8 +2442,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
{
NeonResponse *resp;
BlockNumber n_blocks;
bool latest;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
switch (reln->smgr_relpersistence)
{
@@ -2558,13 +2470,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return n_blocks;
}
neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
{
NeonNblocksRequest request = {
.req.tag = T_NeonNblocksRequest,
.req.latest = latest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forknum,
};
@@ -2612,17 +2523,16 @@ neon_dbsize(Oid dbNode)
{
NeonResponse *resp;
int64 db_size;
XLogRecPtr request_lsn,
not_modified_since;
XLogRecPtr request_lsn;
bool latest;
NRelFileInfo dummy_node = {0};
neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
{
NeonDbSizeRequest request = {
.req.tag = T_NeonDbSizeRequest,
.req.latest = latest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.dbNode = dbNode,
};
@@ -2695,6 +2605,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
/*
@@ -2894,33 +2805,14 @@ neon_end_unlogged_build(SMgrRelation reln)
static int
neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
{
XLogRecPtr request_lsn,
not_modified_since;
if (RecoveryInProgress())
{
request_lsn = GetXLogReplayRecPtr(NULL);
if (request_lsn == InvalidXLogRecPtr)
{
/*
* This happens in neon startup, we start up without replaying any
* records.
*/
request_lsn = GetRedoStartLsn();
}
}
else
request_lsn = GetXLogInsertRecPtr();
request_lsn = nm_adjust_lsn(request_lsn);
XLogRecPtr request_lsn;
/*
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
* segment has not changed since the basebackup, because in order to
* modify it, we would have had to download it already. And once
* downloaded, we never evict SLRU segments from local disk.
* GetRedoStartLsn() returns LSN of basebackup.
* We need to download SLRU segments only once after node startup,
* then SLRUs are maintained locally.
*/
not_modified_since = GetRedoStartLsn();
request_lsn = GetRedoStartLsn();
request_lsn = nm_adjust_lsn(request_lsn);
SlruKind kind;
if (STRPREFIX(path, "pg_xact"))
@@ -2935,8 +2827,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
NeonResponse *resp;
NeonGetSlruSegmentRequest request = {
.req.tag = T_NeonGetSlruSegmentRequest,
.req.latest = false,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.kind = kind,
.segno = segno
@@ -3064,9 +2956,6 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
{
BlockNumber relsize;
/* This is only used in WAL replay */
Assert(RecoveryInProgress());
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
@@ -3085,13 +2974,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
* This length is later reused when we open the smgr to read the
* block, which is fine and expected.
*/
NeonResponse *response;
NeonNblocksResponse *nbresponse;
NeonNblocksRequest request = {
.req = (NeonRequest) {
.tag = T_NeonNblocksRequest,
.lsn = end_recptr,
.not_modified_since = end_recptr,
.latest = false,
.tag = T_NeonNblocksRequest,
},
.rinfo = rinfo,
.forknum = forknum,

View File

@@ -7,7 +7,7 @@ OBJS = \
neontest.o
EXTENSION = neon_test_utils
DATA = neon_test_utils--1.1.sql
DATA = neon_test_utils--1.0.sql
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
PG_CONFIG = pg_config

View File

@@ -31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache'
LANGUAGE C STRICT
PARALLEL UNSAFE;
CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn)
CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn)
RETURNS bytea
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn)
CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn)
RETURNS bytea
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
LANGUAGE C PARALLEL UNSAFE;

View File

@@ -1,6 +1,6 @@
# neon_test_utils extension
comment = 'helpers for neon testing and debugging'
default_version = '1.1'
default_version = '1.0'
module_pathname = '$libdir/neon_test_utils'
relocatable = true
trusted = true

View File

@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
*/
#if PG_MAJORVERSION_NUM < 16
typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
XLogRecPtr request_lsn, bool request_latest, char *buffer);
#else
typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
XLogRecPtr request_lsn, bool request_latest, void *buffer);
#endif
static neon_read_at_lsn_type neon_read_at_lsn_ptr;
@@ -299,11 +299,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
text *forkname;
uint32 blkno;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
if (PG_NARGS() != 5)
elog(ERROR, "unexpected number of arguments in SQL function signature");
bool request_latest = PG_ARGISNULL(3);
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3);
if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
PG_RETURN_NULL();
@@ -312,9 +309,6 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
forkname = PG_GETARG_TEXT_PP(1);
blkno = PG_GETARG_UINT32(2);
request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4);
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
@@ -367,7 +361,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
raw_page_data = VARDATA(raw_page);
neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data);
neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data);
relation_close(rel, AccessShareLock);
@@ -386,9 +380,6 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
{
char *raw_page_data;
if (PG_NARGS() != 7)
elog(ERROR, "unexpected number of arguments in SQL function signature");
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
@@ -412,20 +403,18 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
};
ForkNumber forknum = PG_GETARG_UINT32(3);
uint32 blkno = PG_GETARG_UINT32(4);
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
bool request_latest = PG_ARGISNULL(5);
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5);
/* Initialize buffer to copy to */
bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6);
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
raw_page_data = VARDATA(raw_page);
neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data);
neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data);
PG_RETURN_BYTEA_P(raw_page);
}
}

View File

@@ -279,7 +279,7 @@ async fn handle_client(
// doesn't yet matter as pg-sni-router doesn't report analytics logs
ctx.set_success();
ctx.log_connect();
ctx.log();
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");

View File

@@ -403,43 +403,27 @@ async fn main() -> anyhow::Result<()> {
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
client_tasks.spawn(usage_metrics::task_backup(
&metrics_config.backup_metric_collection_config,
cancellation_token.clone(),
cancellation_token,
));
}
if let auth::BackendType::Console(api, _) = &config.auth_backend {
if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {}
(client1, client2) => {
let cache = api.caches.project_info.clone();
if let Some(client) = client1 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
if let Some(client) = client2 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
if let Some(redis_notifications_client) = redis_notifications_client {
let cache = api.caches.project_info.clone();
maintenance_tasks.spawn(notifications::task_main(
redis_notifications_client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
if let Some(regional_redis_client) = regional_redis_client {
let cache = api.caches.endpoints_cache.clone();
let con = regional_redis_client;
let span = tracing::info_span!("endpoints_cache");
maintenance_tasks.spawn(
async move { cache.do_read(con, cancellation_token.clone()).await }
.instrument(span),
);
maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span));
}
}
}

View File

@@ -4,7 +4,6 @@ use std::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Duration,
};
use dashmap::DashSet;
@@ -14,7 +13,6 @@ use redis::{
};
use serde::Deserialize;
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
use tracing::info;
use crate::{
@@ -113,22 +111,16 @@ impl EndpointsCache {
pub async fn do_read(
&self,
mut con: ConnectionWithCredentialsProvider,
cancellation_token: CancellationToken,
) -> anyhow::Result<Infallible> {
let mut last_id = "0-0".to_string();
loop {
self.ready.store(false, Ordering::Release);
if let Err(e) = con.connect().await {
tracing::error!("error connecting to redis: {:?}", e);
self.ready.store(false, Ordering::Release);
continue;
}
if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
tracing::error!("error reading from redis: {:?}", e);
self.ready.store(false, Ordering::Release);
}
if cancellation_token.is_cancelled() {
info!("cancellation token is cancelled, exiting");
tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
// 1 week.
}
tokio::time::sleep(self.config.retry_interval).await;
}

View File

@@ -260,9 +260,7 @@ impl ConnCfg {
aux: MetricsAuxInfo,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
@@ -272,9 +270,7 @@ impl ConnCfg {
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner();

View File

@@ -533,13 +533,13 @@ pub struct RetryConfig {
impl RetryConfig {
/// Default options for RetryConfig.
/// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
/// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
"num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
/// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
/// Cplane has timeout of 60s on each request. 8m7s in total.
"num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
/// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
/// Cplane has timeout of 60s on each request.
pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
"num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
"num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
/// Parse retry options passed via cmdline.
/// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`].

View File

@@ -20,8 +20,7 @@ use self::parquet::RequestData;
pub mod parquet;
pub static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
/// Context data for a single request to connect to a database.
///
@@ -50,12 +49,9 @@ pub struct RequestMonitoring {
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
sender: Option<mpsc::UnboundedSender<RequestData>>,
// This sender is only used to log the length of session in case of success.
disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
pub latency_timer: LatencyTimer,
// Whether proxy decided that it's not a valid endpoint and rejected it before going to cplane.
rejected: Option<bool>,
disconnect_timestamp: Option<chrono::DateTime<Utc>>,
}
#[derive(Clone, Debug)]
@@ -104,9 +100,7 @@ impl RequestMonitoring {
cold_start_info: ColdStartInfo::Unknown,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
disconnect_timestamp: None,
}
}
@@ -196,7 +190,11 @@ impl RequestMonitoring {
self.success = true;
}
pub fn log_connect(&mut self) {
pub fn log(self) {}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
let outcome = if self.success {
ConnectOutcome::Success
} else {
@@ -228,23 +226,4 @@ impl RequestMonitoring {
let _: Result<(), _> = tx.send(RequestData::from(&*self));
}
}
fn log_disconnect(&mut self) {
// If we are here, it's guaranteed that the user successfully connected to the endpoint.
// Here we log the length of the session.
self.disconnect_timestamp = Some(Utc::now());
if let Some(tx) = self.disconnect_sender.take() {
let _: Result<(), _> = tx.send(RequestData::from(&*self));
}
}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
if self.sender.is_some() {
self.log_connect();
} else {
self.log_disconnect();
}
}
}

View File

@@ -19,10 +19,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{debug, info, Span};
use utils::backoff;
use crate::{
config::{remote_storage_from_toml, OptRemoteStorageConfig},
context::LOG_CHAN_DISCONNECT,
};
use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};
use super::{RequestMonitoring, LOG_CHAN};
@@ -34,9 +31,6 @@ pub struct ParquetUploadArgs {
#[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
parquet_upload_remote_storage: OptRemoteStorageConfig,
#[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig,
/// How many rows to include in a row group
#[clap(long, default_value_t = 8192)]
parquet_upload_row_group_size: usize,
@@ -97,8 +91,6 @@ pub struct RequestData {
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure
duration_us: u64,
/// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`.
disconnect_timestamp: Option<chrono::NaiveDateTime>,
}
impl From<&RequestMonitoring> for RequestData {
@@ -128,7 +120,6 @@ impl From<&RequestMonitoring> for RequestData {
.elapsed()
.unwrap_or_default()
.as_micros() as u64, // 584 millenia... good enough
disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()),
}
}
}
@@ -150,9 +141,8 @@ pub async fn worker(
LOG_CHAN.set(tx.downgrade()).unwrap();
// setup row stream that will close on cancellation
let cancellation_token2 = cancellation_token.clone();
tokio::spawn(async move {
cancellation_token2.cancelled().await;
cancellation_token.cancelled().await;
// dropping this sender will cause the channel to close only once
// all the remaining inflight requests have been completed.
drop(tx);
@@ -177,38 +167,9 @@ pub async fn worker(
test_remote_failures: 0,
};
// TODO(anna): consider moving this to a separate function.
if let Some(disconnect_events_storage_config) =
config.parquet_upload_disconnect_events_remote_storage
{
let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel();
LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap();
// setup row stream that will close on cancellation
tokio::spawn(async move {
cancellation_token.cancelled().await;
// dropping this sender will cause the channel to close only once
// all the remaining inflight requests have been completed.
drop(tx_disconnect);
});
let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx));
let rx_disconnect = rx_disconnect.map(RequestData::from);
let storage_disconnect =
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
.context("remote storage for disconnect events init")?;
let parquet_config_disconnect = parquet_config.clone();
tokio::try_join!(
worker_inner(storage, rx, parquet_config),
worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
)
.map(|_| ())
} else {
worker_inner(storage, rx, parquet_config).await
}
worker_inner(storage, rx, parquet_config).await
}
#[derive(Clone, Debug)]
struct ParquetConfig {
propeties: WriterPropertiesPtr,
rows_per_group: usize,
@@ -452,7 +413,6 @@ mod tests {
)
.unwrap(),
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
})
@@ -491,7 +451,6 @@ mod tests {
success: rng.gen(),
cold_start_info: "no",
duration_us: rng.gen_range(0..30_000_000),
disconnect_timestamp: None,
}
}
@@ -560,15 +519,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315008, 3, 6000),
(1315001, 3, 6000),
(1315061, 3, 6000),
(1315018, 3, 6000),
(1315148, 3, 6000),
(1314990, 3, 6000),
(1314782, 3, 6000),
(1315018, 3, 6000),
(438575, 1, 2000)
(1314385, 3, 6000),
(1314378, 3, 6000),
(1314438, 3, 6000),
(1314395, 3, 6000),
(1314525, 3, 6000),
(1314367, 3, 6000),
(1314159, 3, 6000),
(1314395, 3, 6000),
(438352, 1, 2000)
]
);
@@ -598,11 +557,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1221738, 5, 10000),
(1227888, 5, 10000),
(1229682, 5, 10000),
(1229044, 5, 10000),
(1220322, 5, 10000)
(1220633, 5, 10000),
(1226783, 5, 10000),
(1228577, 5, 10000),
(1227939, 5, 10000),
(1219217, 5, 10000)
]
);
@@ -634,11 +593,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1207385, 5, 10000),
(1207116, 5, 10000),
(1207409, 5, 10000),
(1207397, 5, 10000),
(1207652, 5, 10000)
(1206280, 5, 10000),
(1206011, 5, 10000),
(1206304, 5, 10000),
(1206292, 5, 10000),
(1206547, 5, 10000)
]
);
@@ -663,15 +622,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315008, 3, 6000),
(1315001, 3, 6000),
(1315061, 3, 6000),
(1315018, 3, 6000),
(1315148, 3, 6000),
(1314990, 3, 6000),
(1314782, 3, 6000),
(1315018, 3, 6000),
(438575, 1, 2000)
(1314385, 3, 6000),
(1314378, 3, 6000),
(1314438, 3, 6000),
(1314395, 3, 6000),
(1314525, 3, 6000),
(1314367, 3, 6000),
(1314159, 3, 6000),
(1314395, 3, 6000),
(438352, 1, 2000)
]
);
@@ -708,7 +667,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
[(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
);
tmpdir.close().unwrap();

View File

@@ -284,8 +284,6 @@ pub struct ComputeConnectionLatencyGroup {
pub enum LatencyExclusions {
Client,
ClientAndCplane,
ClientCplaneCompute,
ClientCplaneComputeRetry,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
@@ -354,7 +352,6 @@ pub enum Waiting {
Cplane,
Client,
Compute,
RetryTimeout,
}
#[derive(Default)]
@@ -362,7 +359,6 @@ struct Accumulated {
cplane: time::Duration,
client: time::Duration,
compute: time::Duration,
retry: time::Duration,
}
pub struct LatencyTimer {
@@ -425,7 +421,6 @@ impl Drop for LatencyTimerPause<'_> {
Waiting::Cplane => self.timer.accumulated.cplane += dur,
Waiting::Client => self.timer.accumulated.client += dur,
Waiting::Compute => self.timer.accumulated.compute += dur,
Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
}
}
}
@@ -469,34 +464,6 @@ impl Drop for LatencyTimer {
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
// Exclude client, cplane, and compute communication from the accumulated time.
let accumulated_total =
self.accumulated.client + self.accumulated.cplane + self.accumulated.compute;
metric.observe(
ComputeConnectionLatencyGroup {
protocol: self.protocol,
cold_start_info: self.cold_start_info,
outcome: self.outcome,
excluded: LatencyExclusions::ClientCplaneCompute,
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
// Exclude client, cplane, compute, and retry communication from the accumulated time.
let accumulated_total = self.accumulated.client
+ self.accumulated.cplane
+ self.accumulated.compute
+ self.accumulated.retry;
metric.observe(
ComputeConnectionLatencyGroup {
protocol: self.protocol,
cold_start_info: self.cold_start_info,
outcome: self.outcome,
excluded: LatencyExclusions::ClientCplaneComputeRetry,
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
}
}

View File

@@ -1,26 +1,42 @@
//! Proxy Protocol V2 implementation
use std::{
future::{poll_fn, Future},
io,
net::SocketAddr,
pin::Pin,
task::{Context, Poll},
pin::{pin, Pin},
task::{ready, Context, Poll},
};
use bytes::BytesMut;
use bytes::{Buf, BytesMut};
use hyper::server::conn::AddrIncoming;
use pin_project_lite::pin_project;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
pub struct ProxyProtocolAccept {
pub incoming: AddrIncoming,
pub protocol: &'static str,
}
pin_project! {
/// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
pub struct ChainRW<T> {
pub struct WithClientIp<T> {
#[pin]
pub inner: T,
buf: BytesMut,
tlv_bytes: u16,
state: ProxyParse,
}
}
impl<T: AsyncWrite> AsyncWrite for ChainRW<T> {
/// Progress of proxy protocol header parsing for a `WithClientIp` stream.
#[derive(Clone, PartialEq, Debug)]
enum ProxyParse {
/// Initial state: the header has not been parsed yet.
NotStarted,
/// Parsing completed and yielded the client's socket address.
Finished(SocketAddr),
/// Parsing completed without yielding a client address.
None,
}
impl<T: AsyncWrite> AsyncWrite for WithClientIp<T> {
#[inline]
fn poll_write(
self: Pin<&mut Self>,
@@ -55,174 +71,267 @@ impl<T: AsyncWrite> AsyncWrite for ChainRW<T> {
}
}
impl<T> WithClientIp<T> {
    /// Wraps `inner` in a stream whose proxy protocol header has not been parsed yet.
    ///
    /// The internal buffer is preallocated with 128 bytes of capacity and no
    /// TLV bytes are pending.
    pub fn new(inner: T) -> Self {
        let buf = BytesMut::with_capacity(128);
        Self {
            inner,
            buf,
            tlv_bytes: 0,
            state: ProxyParse::NotStarted,
        }
    }

    /// Returns the client address if header parsing has finished and produced one.
    ///
    /// Yields `None` both while parsing has not started and when parsing
    /// completed without an address.
    pub fn client_addr(&self) -> Option<SocketAddr> {
        if let ProxyParse::Finished(addr) = self.state {
            Some(addr)
        } else {
            None
        }
    }
}
impl<T: AsyncRead + Unpin> WithClientIp<T> {
    /// Resolves the client address, parsing the proxy protocol header on first use.
    ///
    /// Once parsing has completed, the stored outcome is returned without
    /// touching the underlying stream again. If parsing fails with an I/O
    /// error, the error is propagated and the state is left unchanged.
    pub async fn wait_for_addr(&mut self) -> io::Result<Option<SocketAddr>> {
        // Fast paths: the header was already handled on an earlier call.
        if let ProxyParse::Finished(known) = self.state {
            return Ok(Some(known));
        }
        if matches!(self.state, ProxyParse::None) {
            return Ok(None);
        }

        // Not started yet: drive the header parser to completion.
        let mut pinned = Pin::new(&mut *self);
        let parsed = poll_fn(|cx| pinned.as_mut().poll_client_ip(cx)).await?;

        // Remember the outcome so later calls can skip parsing.
        self.state = parsed.map_or(ProxyParse::None, ProxyParse::Finished);
        Ok(parsed)
    }
}
/// Proxy Protocol Version 2 Header
const HEADER: [u8; 12] = [
0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
];
pub async fn read_proxy_protocol<T: AsyncRead + Unpin>(
mut read: T,
) -> std::io::Result<(ChainRW<T>, Option<SocketAddr>)> {
let mut buf = BytesMut::with_capacity(128);
while buf.len() < 16 {
let bytes_read = read.read_buf(&mut buf).await?;
impl<T: AsyncRead> WithClientIp<T> {
/// implementation of <https://www.haproxy.org/download/2.4/doc/proxy-protocol.txt>
/// Version 2 (Binary Format)
fn poll_client_ip(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<io::Result<Option<SocketAddr>>> {
// The binary header format starts with a constant 12 bytes block containing the protocol signature :
// \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A
while self.buf.len() < 16 {
let mut this = self.as_mut().project();
let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?;
// exit for bad header
let len = usize::min(buf.len(), HEADER.len());
if buf[..len] != HEADER[..len] {
return Ok((ChainRW { inner: read, buf }, None));
// exit for bad header
let len = usize::min(self.buf.len(), HEADER.len());
if self.buf[..len] != HEADER[..len] {
return Poll::Ready(Ok(None));
}
// if no more bytes available then exit
if ready!(bytes_read) == 0 {
return Poll::Ready(Ok(None));
};
}
// if no more bytes available then exit
if bytes_read == 0 {
return Ok((ChainRW { inner: read, buf }, None));
};
}
let header = buf.split_to(16);
// The next byte (the 13th one) is the protocol version and command.
// The highest four bits contains the version. As of this specification, it must
// always be sent as \x2 and the receiver must only accept this value.
let vc = header[12];
let version = vc >> 4;
let command = vc & 0b1111;
if version != 2 {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol version. expected version 2",
));
}
match command {
// the connection was established on purpose by the proxy
// without being relayed. The connection endpoints are the sender and the
// receiver. Such connections exist when the proxy sends health-checks to the
// server. The receiver must accept this connection as valid and must use the
// real connection endpoints and discard the protocol block including the
// family which is ignored.
0 => {}
// the connection was established on behalf of another node,
// and reflects the original connection endpoints. The receiver must then use
// the information provided in the protocol block to get original the address.
1 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
_ => {
return Err(io::Error::new(
// The next byte (the 13th one) is the protocol version and command.
// The highest four bits contains the version. As of this specification, it must
// always be sent as \x2 and the receiver must only accept this value.
let vc = self.buf[12];
let version = vc >> 4;
let command = vc & 0b1111;
if version != 2 {
return Poll::Ready(Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol command. expected local (0) or proxy (1)",
))
"invalid proxy protocol version. expected version 2",
)));
}
};
match command {
// the connection was established on purpose by the proxy
// without being relayed. The connection endpoints are the sender and the
// receiver. Such connections exist when the proxy sends health-checks to the
// server. The receiver must accept this connection as valid and must use the
// real connection endpoints and discard the protocol block including the
// family which is ignored.
0 => {}
// the connection was established on behalf of another node,
// and reflects the original connection endpoints. The receiver must then use
// the information provided in the protocol block to get original the address.
1 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
_ => {
return Poll::Ready(Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol command. expected local (0) or proxy (1)",
)))
}
};
// The 14th byte contains the transport protocol and address family. The highest 4
// bits contain the address family, the lowest 4 bits contain the protocol.
let ft = header[13];
let address_length = match ft {
// - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
// - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
0x11 | 0x12 => 12,
// - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
// - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
0x21 | 0x22 => 36,
// unspecified or unix stream. ignore the addresses
_ => 0,
};
// The 14th byte contains the transport protocol and address family. The highest 4
// bits contain the address family, the lowest 4 bits contain the protocol.
let ft = self.buf[13];
let address_length = match ft {
// - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
// - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
0x11 | 0x12 => 12,
// - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
// - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
0x21 | 0x22 => 36,
// unspecified or unix stream. ignore the addresses
_ => 0,
};
// The 15th and 16th bytes is the address length in bytes in network endian order.
// It is used so that the receiver knows how many address bytes to skip even when
// it does not implement the presented protocol. Thus the length of the protocol
// header in bytes is always exactly 16 + this value. When a sender presents a
// LOCAL connection, it should not present any address so it sets this field to
// zero. Receivers MUST always consider this field to skip the appropriate number
// of bytes and must not assume zero is presented for LOCAL connections. When a
// receiver accepts an incoming connection showing an UNSPEC address family or
// protocol, it may or may not decide to log the address information if present.
let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap());
if remaining_length < address_length {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol length. not enough to fit requested IP addresses",
));
}
drop(header);
while buf.len() < remaining_length as usize {
if read.read_buf(&mut buf).await? == 0 {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"stream closed while waiting for proxy protocol addresses",
));
// The 15th and 16th bytes is the address length in bytes in network endian order.
// It is used so that the receiver knows how many address bytes to skip even when
// it does not implement the presented protocol. Thus the length of the protocol
// header in bytes is always exactly 16 + this value. When a sender presents a
// LOCAL connection, it should not present any address so it sets this field to
// zero. Receivers MUST always consider this field to skip the appropriate number
// of bytes and must not assume zero is presented for LOCAL connections. When a
// receiver accepts an incoming connection showing an UNSPEC address family or
// protocol, it may or may not decide to log the address information if present.
let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap());
if remaining_length < address_length {
return Poll::Ready(Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol length. not enough to fit requested IP addresses",
)));
}
while self.buf.len() < 16 + address_length as usize {
let mut this = self.as_mut().project();
if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 {
return Poll::Ready(Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"stream closed while waiting for proxy protocol addresses",
)));
}
}
let this = self.as_mut().project();
// we are sure this is a proxy protocol v2 entry and we have read all the bytes we need
// discard the header we have parsed
this.buf.advance(16);
// Starting from the 17th byte, addresses are presented in network byte order.
// The address order is always the same :
// - source layer 3 address in network byte order
// - destination layer 3 address in network byte order
// - source layer 4 address if any, in network byte order (port)
// - destination layer 4 address if any, in network byte order (port)
let addresses = this.buf.split_to(address_length as usize);
let socket = match address_length {
12 => {
let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap();
let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap());
Some(SocketAddr::from((src_addr, src_port)))
}
36 => {
let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap();
let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap());
Some(SocketAddr::from((src_addr, src_port)))
}
_ => None,
};
*this.tlv_bytes = remaining_length - address_length;
self.as_mut().skip_tlv_inner();
Poll::Ready(Ok(socket))
}
// Starting from the 17th byte, addresses are presented in network byte order.
// The address order is always the same :
// - source layer 3 address in network byte order
// - destination layer 3 address in network byte order
// - source layer 4 address if any, in network byte order (port)
// - destination layer 4 address if any, in network byte order (port)
let addresses = buf.split_to(remaining_length as usize);
let socket = match address_length {
12 => {
let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap();
let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap());
Some(SocketAddr::from((src_addr, src_port)))
#[cold]
fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let ip = ready!(self.as_mut().poll_client_ip(cx)?);
match ip {
Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x),
None => *self.as_mut().project().state = ProxyParse::None,
}
36 => {
let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap();
let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap());
Some(SocketAddr::from((src_addr, src_port)))
}
_ => None,
};
Poll::Ready(Ok(()))
}
Ok((ChainRW { inner: read, buf }, socket))
/// Reads more bytes from the inner stream and discards as many pending TLV
/// bytes as that read made available.
///
/// Returns `Poll::Pending` when the inner stream has no data ready, and an
/// `UnexpectedEof` error when the stream ends before every TLV byte has
/// been skipped. Without that check a zero-byte read would report success
/// while making no progress, and a caller that loops until `tlv_bytes`
/// reaches zero would spin forever on a closed connection.
#[cold]
fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
    let mut this = self.as_mut().project();
    // we know that this.buf is empty
    debug_assert_eq!(this.buf.len(), 0);

    // Cap the reservation so a large advertised TLV length cannot force a big allocation.
    this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize);
    let bytes_read = ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?);
    if bytes_read == 0 {
        // End of stream: the remaining TLV bytes will never arrive.
        return Poll::Ready(Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "stream closed while skipping proxy protocol TLV bytes",
        )));
    }
    self.skip_tlv_inner();

    Poll::Ready(Ok(()))
}
/// Drops from the front of `buf` as many bytes as belong to the outstanding
/// TLV section and lowers `tlv_bytes` by the same amount.
fn skip_tlv_inner(self: Pin<&mut Self>) {
    let this = self.project();
    let outstanding = *this.tlv_bytes;

    // If the buffer holds more than u16::MAX bytes, it necessarily contains the
    // whole remaining TLV section; otherwise only part of it may have arrived.
    let skipped = u16::try_from(this.buf.len())
        .map_or(outstanding, |buffered| buffered.min(outstanding));

    *this.tlv_bytes = outstanding - skipped;
    this.buf.advance(usize::from(skipped));
}
}
impl<T: AsyncRead> AsyncRead for ChainRW<T> {
impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
#[inline]
fn poll_read(
self: Pin<&mut Self>,
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
if self.buf.is_empty() {
self.project().inner.poll_read(cx, buf)
} else {
self.read_from_buf(buf)
}
}
}
// I'm assuming these 3 comparisons will be easy to branch predict.
// especially with the cold attributes
// which should make this read wrapper almost invisible
if let ProxyParse::NotStarted = self.state {
ready!(self.as_mut().read_ip(cx)?);
}
while self.tlv_bytes > 0 {
ready!(self.as_mut().skip_tlv(cx)?)
}
impl<T: AsyncRead> ChainRW<T> {
#[cold]
fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll<io::Result<()>> {
debug_assert!(!self.buf.is_empty());
let this = self.project();
let write = usize::min(this.buf.len(), buf.remaining());
let slice = this.buf.split_to(write).freeze();
buf.put_slice(&slice);
// reset the allocation so it can be freed
if this.buf.is_empty() {
*this.buf = BytesMut::new();
}
this.inner.poll_read(cx, buf)
} else {
// we know that tlv_bytes is 0
debug_assert_eq!(*this.tlv_bytes, 0);
Poll::Ready(Ok(()))
let write = usize::min(this.buf.len(), buf.remaining());
let slice = this.buf.split_to(write).freeze();
buf.put_slice(&slice);
// reset the allocation so it can be freed
if this.buf.is_empty() {
*this.buf = BytesMut::new();
}
Poll::Ready(Ok(()))
}
}
}
#[cfg(test)]
mod tests {
use std::pin::pin;
use tokio::io::AsyncReadExt;
use crate::protocol2::read_proxy_protocol;
use crate::protocol2::{ProxyParse, WithClientIp};
#[tokio::test]
async fn test_ipv4() {
@@ -244,15 +353,16 @@ mod tests {
let extra_data = [0x55; 256];
let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice())));
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into()));
assert_eq!(
read.state,
ProxyParse::Finished(([127, 0, 0, 1], 65535).into())
);
}
#[tokio::test]
@@ -275,17 +385,17 @@ mod tests {
let extra_data = [0x55; 256];
let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice())));
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
assert_eq!(
addr,
Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into())
read.state,
ProxyParse::Finished(
([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()
)
);
}
@@ -293,24 +403,24 @@ mod tests {
async fn test_invalid() {
let data = [0x55; 256];
let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap();
let mut read = pin!(WithClientIp::new(data.as_slice()));
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(addr, None);
assert_eq!(read.state, ProxyParse::None);
}
#[tokio::test]
async fn test_short() {
let data = [0x55; 10];
let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap();
let mut read = pin!(WithClientIp::new(data.as_slice()));
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(addr, None);
assert_eq!(read.state, ProxyParse::None);
}
#[tokio::test]
@@ -336,14 +446,15 @@ mod tests {
let extra_data = [0xaa; 256];
let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice())));
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into()));
assert_eq!(
read.state,
ProxyParse::Finished(([55, 56, 57, 58], 65535).into())
);
}
}

View File

@@ -17,7 +17,7 @@ use crate::{
context::RequestMonitoring,
error::ReportableError,
metrics::{Metrics, NumClientConnectionsGuard},
protocol2::read_proxy_protocol,
protocol2::WithClientIp,
proxy::handshake::{handshake, HandshakeData},
stream::{PqStream, Stream},
EndpointCacheKey,
@@ -88,18 +88,20 @@ pub async fn task_main(
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await{
Ok((socket, Some(addr))) => (socket, addr.ip()),
let mut socket = WithClientIp::new(socket);
let mut peer_addr = peer_addr.ip();
match socket.wait_for_addr().await {
Ok(Some(addr)) => peer_addr = addr.ip(),
Err(e) => {
error!("per-client task finished with an error: {e:#}");
return;
}
Ok((_socket, None)) if config.require_client_ip => {
Ok(None) if config.require_client_ip => {
error!("missing required client IP");
return;
}
Ok((socket, None)) => (socket, peer_addr.ip())
};
Ok(None) => {}
}
match socket.inner.set_nodelay(true) {
Ok(()) => {},
@@ -132,14 +134,16 @@ pub async fn task_main(
Err(e) => {
// todo: log and push to ctx the error kind
ctx.set_error_kind(e.get_error_kind());
ctx.log();
error!(parent: &span, "per-client task finished with an error: {e:#}");
}
Ok(None) => {
ctx.set_success();
ctx.log();
}
Ok(Some(p)) => {
ctx.set_success();
ctx.log_connect();
ctx.log();
match p.proxy_pass().instrument(span.clone()).await {
Ok(()) => {}
Err(e) => {

View File

@@ -133,17 +133,10 @@ where
error!(error = ?err, "could not connect to compute node");
let node_info = if !node_info.cached() || !err.should_retry_database_address() {
let node_info = if !node_info.cached() {
// If we just received this from cplane and didn't get it from cache, we shouldn't retry.
// Do not need to retrieve a new node_info, just return the old one.
if !err.should_retry(num_retries, connect_to_compute_retry_config) {
Metrics::get().proxy.retries_metric.observe(
RetriesMetricGroup {
outcome: ConnectOutcome::Failed,
retry_type,
},
num_retries.into(),
);
return Err(err.into());
}
node_info
@@ -201,10 +194,6 @@ where
let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
num_retries += 1;
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
time::sleep(wait_duration).await;
drop(pause);
}
}

View File

@@ -10,9 +10,6 @@ pub trait ShouldRetry {
err => err.could_retry(),
}
}
/// Default answer for error types that do not override this method: `true`.
///
/// NOTE(review): the caller appears to use a `false` answer to keep the
/// existing node info instead of fetching a new one — confirm against the
/// connect-to-compute retry loop.
fn should_retry_database_address(&self) -> bool {
true
}
}
impl ShouldRetry for io::Error {
@@ -36,21 +33,6 @@ impl ShouldRetry for tokio_postgres::error::DbError {
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
}
/// Answers `false` for a fixed set of SQLSTATE codes and `true` for every
/// other code.
fn should_retry_database_address(&self) -> bool {
    use tokio_postgres::error::SqlState;

    // The codes listed here are errors that happen after the user has
    // successfully authenticated to the database.
    // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
    match self.code() {
        &SqlState::TOO_MANY_CONNECTIONS
        | &SqlState::OUT_OF_MEMORY
        | &SqlState::SYNTAX_ERROR
        | &SqlState::T_R_SERIALIZATION_FAILURE
        | &SqlState::INVALID_CATALOG_NAME
        | &SqlState::INVALID_SCHEMA_NAME
        | &SqlState::INVALID_PARAMETER_VALUE => false,
        _ => true,
    }
}
}
impl ShouldRetry for tokio_postgres::Error {
@@ -63,15 +45,6 @@ impl ShouldRetry for tokio_postgres::Error {
false
}
}
fn should_retry_database_address(&self) -> bool {
if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
io::Error::should_retry_database_address(io_err)
} else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
tokio_postgres::error::DbError::should_retry_database_address(db_err)
} else {
true
}
}
}
impl ShouldRetry for compute::ConnectionError {
@@ -82,13 +55,6 @@ impl ShouldRetry for compute::ConnectionError {
_ => false,
}
}
fn should_retry_database_address(&self) -> bool {
match self {
compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
_ => true,
}
}
}
pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {

View File

@@ -174,7 +174,7 @@ async fn dummy_proxy(
tls: Option<TlsConfig>,
auth: impl TestAuth + Send,
) -> anyhow::Result<()> {
let (client, _) = read_proxy_protocol(client).await?;
let client = WithClientIp::new(client);
let mut stream = match handshake(client, tls.as_ref(), false).await? {
HandshakeData::Startup(stream, _) => stream,
HandshakeData::Cancel(_) => bail!("cancellation not supported"),

View File

@@ -54,11 +54,7 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
let wait_duration = retry_after(*num_retries, config);
*num_retries += 1;
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
tokio::time::sleep(wait_duration).await;
drop(pause);
}
}

View File

@@ -33,7 +33,7 @@ use crate::cancellation::CancellationHandlerMain;
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::metrics::Metrics;
use crate::protocol2::read_proxy_protocol;
use crate::protocol2::WithClientIp;
use crate::proxy::run_until_cancelled;
use crate::serverless::backend::PoolingBackend;
use crate::serverless::http_util::{api_error_into_response, json_response};
@@ -158,8 +158,9 @@ async fn connection_handler(
.guard(crate::metrics::Protocol::Http);
// handle PROXY protocol
let (conn, peer) = match read_proxy_protocol(conn).await {
Ok(c) => c,
let mut conn = WithClientIp::new(conn);
let peer = match conn.wait_for_addr().await {
Ok(peer) => peer,
Err(e) => {
tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return;

View File

@@ -179,9 +179,7 @@ impl ConnectMechanism for TokioMechanism {
.dbname(&self.conn_info.dbname)
.connect_timeout(timeout);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
drop(pause);
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
Ok(poll_client(

View File

@@ -156,15 +156,17 @@ pub async fn serve_websocket(
Err(e) => {
// todo: log and push to ctx the error kind
ctx.set_error_kind(e.get_error_kind());
ctx.log();
Err(e.into())
}
Ok(None) => {
ctx.set_success();
ctx.log();
Ok(())
}
Ok(Some(p)) => {
ctx.set_success();
ctx.log_connect();
ctx.log();
p.proxy_pass().await
}
}

View File

@@ -22,15 +22,9 @@ serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres_ffi.workspace = true
tokio-stream.workspace = true
tokio-postgres.workspace = true
tokio-util = { workspace = true }
futures-util.workspace = true
itertools.workspace = true
camino.workspace = true
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }

View File

@@ -67,12 +67,10 @@ the purge command will log all the keys that it would have deleted.
#### `scan-metadata`
Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency.
Errors are logged to stderr and summary to stdout.
Walk objects in a pageserver S3 bucket, and report statistics on the contents.
For pageserver:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
Timelines: 31106
With errors: 3
@@ -84,10 +82,6 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
```
For safekeepers, dump_db_connstr and dump_db_table must be
specified; they should point to a table with a debug dump, which will be used
to list timelines and find their backup and start LSNs.
## Cleaning up running pageservers
If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.

View File

@@ -1,13 +1,11 @@
use chrono::{DateTime, Utc};
use futures::Future;
use hex::FromHex;
use std::time::Duration;
use chrono::{DateTime, Utc};
use hex::FromHex;
use reqwest::{header, Client, StatusCode, Url};
use serde::Deserialize;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -139,7 +137,7 @@ pub struct ProjectData {
pub region_id: String,
pub platform_id: String,
pub user_id: String,
pub pageserver_id: Option<u64>,
pub pageserver_id: u64,
#[serde(deserialize_with = "from_nullable_id")]
pub tenant: TenantId,
pub safekeepers: Vec<SafekeeperData>,
@@ -157,7 +155,7 @@ pub struct ProjectData {
pub maintenance_set: Option<String>,
}
#[derive(Debug, Clone, serde::Deserialize)]
#[derive(Debug, serde::Deserialize)]
pub struct BranchData {
pub id: BranchId,
pub created_at: DateTime<Utc>,
@@ -212,39 +210,30 @@ impl CloudAdminApiClient {
.await
.expect("Semaphore is not closed");
let response = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("tenant_id", tenant_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response: AdminApiResponse<Vec<ProjectData>> =
response.json().await.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::BodyRead(e),
)
})?;
Ok(response)
},
"find_tenant_project",
)
.await?;
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("tenant_id", tenant_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::BodyRead(e),
)
})?;
match response.data.len() {
0 => Ok(None),
1 => Ok(Some(
@@ -272,34 +261,42 @@ impl CloudAdminApiClient {
const PAGINATION_LIMIT: usize = 512;
let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
loop {
let response_bytes = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("show_deleted", "false".to_string()),
("limit", format!("{PAGINATION_LIMIT}")),
("offset", format!("{pagination_offset}")),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"List active projects".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("show_deleted", "false".to_string()),
("limit", format!("{PAGINATION_LIMIT}")),
("offset", format!("{pagination_offset}")),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"List active projects".to_string(),
ErrorKind::RequestSend(e),
)
})?;
response.bytes().await.map_err(|e| {
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
})
},
"list_projects",
)
.await?;
match response.status() {
StatusCode::OK => {}
StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
tokio::time::sleep(Duration::from_millis(500)).await;
continue;
}
_status => {
return Err(Error::new(
"List active projects".to_string(),
ErrorKind::ResponseStatus(response.status()),
))
}
}
let response_bytes = response.bytes().await.map_err(|e| {
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
})?;
let decode_result =
serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -330,7 +327,6 @@ impl CloudAdminApiClient {
pub async fn find_timeline_branch(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Option<BranchData>, Error> {
let _permit = self
@@ -339,61 +335,43 @@ impl CloudAdminApiClient {
.await
.expect("Semaphore is not closed");
let response = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/branches"))
.query(&[
("timeline_id", timeline_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response = self
.http_client
.get(self.append_url("/branches"))
.query(&[
("timeline_id", timeline_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response: AdminApiResponse<Vec<BranchData>> =
response.json().await.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::BodyRead(e),
)
})?;
Ok(response)
},
"find_timeline_branch",
)
.await?;
let mut branches: Vec<BranchData> = response.data.into_iter().collect();
// Normally timeline_id is unique. However, we do have at least one case
// of the same timeline_id in two different projects, apparently after
// manual recovery. So always recheck project_id (discovered through
// tenant_id).
let project_data = match self.find_tenant_project(tenant_id).await? {
Some(pd) => pd,
None => return Ok(None),
};
branches.retain(|b| b.project_id == project_data.id);
if branches.len() < 2 {
Ok(branches.first().cloned())
} else {
Err(Error::new(
format!(
"Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
tenant_id,
timeline_id,
branches.len()
),
let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::BodyRead(e),
)
})?;
match response.data.len() {
0 => Ok(None),
1 => Ok(Some(
response
.data
.into_iter()
.next()
.expect("Should have exactly one element"),
)),
too_many => Err(Error::new(
format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
ErrorKind::UnexpectedState,
))
)),
}
}
@@ -554,15 +532,4 @@ impl CloudAdminApiClient {
.parse()
.unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
}
/// Runs `op` through `backoff::retry`, labelling the operation with `description`.
///
/// `op` is a closure producing a fresh future per attempt. The predicate passed
/// to `backoff::retry` always returns `false`.
/// NOTE(review): presumably that predicate means "is this error permanent?" and
/// the literals `1` and `20` are a warn threshold and a maximum attempt count —
/// confirm against `utils::backoff::retry`.
async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
where
O: FnMut() -> F,
F: Future<Output = Result<T, Error>>,
{
// A fresh token that nothing else holds, so it is never cancelled here.
let cancel = CancellationToken::new(); // not really used
backoff::retry(op, |_| false, 1, 20, description, &cancel)
.await
// The token above is never cancelled, so the cancelled outcome is not expected.
.expect("cancellations are disabled")
}
}

View File

@@ -60,7 +60,6 @@ pub struct GarbageList {
/// see garbage, we saw some active tenants too. This protects against classes of bugs
/// in the scrubber that might otherwise generate a "deleted all" result.
active_tenant_count: usize,
active_timeline_count: usize,
}
impl GarbageList {
@@ -68,7 +67,6 @@ impl GarbageList {
Self {
items: Vec::new(),
active_tenant_count: 0,
active_timeline_count: 0,
node_kind,
bucket_config,
}
@@ -121,10 +119,7 @@ pub async fn find_garbage(
const S3_CONCURRENCY: usize = 32;
// How many concurrent API requests to make to the console API.
//
// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It
// would be better to implement a real rps limiter.
const CONSOLE_CONCURRENCY: usize = 16;
const CONSOLE_CONCURRENCY: usize = 128;
struct ConsoleCache {
/// Set of tenants found in the control plane API
@@ -226,7 +221,6 @@ async fn find_garbage_inner(
} else {
tracing::debug!("Tenant {tenant_shard_id} is active");
active_tenants.push(tenant_shard_id);
garbage.active_tenant_count = active_tenants.len();
}
counter += 1;
@@ -267,7 +261,7 @@ async fn find_garbage_inner(
let api_client = cloud_admin_api_client.clone();
async move {
api_client
.find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id)
.find_timeline_branch(ttid.timeline_id)
.await
.map_err(|e| anyhow::anyhow!(e))
.map(|r| (ttid, r))
@@ -277,29 +271,15 @@ async fn find_garbage_inner(
std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));
// Update the GarbageList with any timelines which appear not to exist.
let mut active_timelines: Vec<TenantShardTimelineId> = vec![];
while let Some(result) = timelines_checked.next().await {
let (ttid, console_result) = result?;
if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
tracing::debug!("Timeline {ttid} is garbage");
} else {
tracing::debug!("Timeline {ttid} is active");
active_timelines.push(ttid);
garbage.active_timeline_count = active_timelines.len();
}
}
let num_garbage_timelines = garbage
.items
.iter()
.filter(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
.count();
tracing::info!(
"Found {}/{} garbage timelines in active tenants",
num_garbage_timelines,
active_timelines.len(),
);
Ok(garbage)
}
@@ -364,22 +344,16 @@ pub async fn get_timeline_objects(
const MAX_KEYS_PER_DELETE: usize = 1000;
/// Drain a buffer of keys into DeleteObjects requests
///
/// If `drain` is true, drains keys completely; otherwise stops when <
/// MAX_KEYS_PER_DELETE keys are left.
/// `num_deleted` returns number of deleted keys.
async fn do_delete(
s3_client: &Arc<Client>,
bucket_name: &str,
keys: &mut Vec<ObjectIdentifier>,
dry_run: bool,
drain: bool,
progress_tracker: &mut DeletionProgressTracker,
) -> anyhow::Result<()> {
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
let request_keys =
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
let num_deleted = request_keys.len();
if dry_run {
tracing::info!("Dry-run deletion of objects: ");
for k in request_keys {
@@ -394,30 +368,12 @@ async fn do_delete(
.send()
.await
.context("DeleteObjects request")?;
progress_tracker.register(num_deleted);
}
}
Ok(())
}
/// Simple tracker that logs a progress line every 10k deleted keys.
#[derive(Default)]
struct DeletionProgressTracker {
    /// Total number of keys registered as deleted so far.
    num_deleted: usize,
    /// Value of `num_deleted` when the last progress line was logged.
    last_reported_num_deleted: usize,
}

impl DeletionProgressTracker {
    /// Number of newly deleted keys after which another progress line is logged.
    const REPORT_INTERVAL: usize = 10_000;

    /// Records `n` more deleted keys and logs the running total once at least
    /// `REPORT_INTERVAL` keys have been deleted since the previous log line.
    fn register(&mut self, n: usize) {
        self.num_deleted += n;
        // `>=` rather than `>`: with `>`, reaching the interval exactly does not
        // report, so full 1000-key batches would log every 11k keys, not 10k.
        if self.num_deleted - self.last_reported_num_deleted >= Self::REPORT_INTERVAL {
            tracing::info!("progress: deleted {} keys", self.num_deleted);
            self.last_reported_num_deleted = self.num_deleted;
        }
    }
}
pub async fn purge_garbage(
input_path: String,
mode: PurgeMode,
@@ -438,14 +394,6 @@ pub async fn purge_garbage(
if garbage_list.active_tenant_count == 0 {
anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
}
if garbage_list
.items
.iter()
.any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
&& garbage_list.active_timeline_count == 0
{
anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
}
let filtered_items = garbage_list
.items
@@ -481,7 +429,6 @@ pub async fn purge_garbage(
std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));
let mut objects_to_delete = Vec::new();
let mut progress_tracker = DeletionProgressTracker::default();
while let Some(result) = get_objects_results.next().await {
let mut object_list = result?;
objects_to_delete.append(&mut object_list);
@@ -492,7 +439,6 @@ pub async fn purge_garbage(
&mut objects_to_delete,
dry_run,
false,
&mut progress_tracker,
)
.await?;
}
@@ -504,11 +450,10 @@ pub async fn purge_garbage(
&mut objects_to_delete,
dry_run,
true,
&mut progress_tracker,
)
.await?;
tracing::info!("{} keys deleted in total", progress_tracker.num_deleted);
tracing::info!("Fell through");
Ok(())
}

View File

@@ -4,9 +4,7 @@ pub mod checks;
pub mod cloud_admin_api;
pub mod garbage;
pub mod metadata_stream;
pub mod scan_pageserver_metadata;
pub mod scan_safekeeper_metadata;
pub mod tenant_snapshot;
pub mod scan_metadata;
use std::env;
use std::fmt::Display;
@@ -25,18 +23,17 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
use aws_sdk_s3::{Client, Config};
use aws_smithy_async::rt::sleep::TokioSleep;
use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::io::IsTerminal;
use tokio::io::AsyncReadExt;
use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use utils::fs_ext;
use utils::id::{TenantId, TimelineId};
use utils::id::TimelineId;
const MAX_RETRIES: usize = 20;
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -142,34 +139,12 @@ impl RootTarget {
pub fn tenants_root(&self) -> S3Target {
match self {
Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
Self::Safekeeper(root) => root.clone(),
Self::Safekeeper(root) => root.with_sub_segment("wal"),
}
}
pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
match self {
Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
Self::Safekeeper(_) => self
.tenants_root()
.with_sub_segment(&tenant_id.tenant_id.to_string()),
}
}
pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
// Only pageserver remote storage contains tenant-shards
assert!(matches!(self, Self::Pageserver(_)));
let Self::Pageserver(root) = self else {
panic!();
};
S3Target {
bucket_name: root.bucket_name.clone(),
prefix_in_bucket: format!(
"{}/{TENANTS_SEGMENT_NAME}/{tenant_id}",
root.prefix_in_bucket
),
delimiter: root.delimiter.clone(),
}
self.tenants_root().with_sub_segment(&tenant_id.to_string())
}
pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target {
@@ -265,6 +240,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
.with_ansi(false)
.with_writer(file_writer);
let stderr_logs = fmt::Layer::new()
.with_ansi(std::io::stderr().is_terminal())
.with_target(false)
.with_writer(std::io::stderr);
tracing_subscriber::registry()
@@ -343,7 +319,9 @@ fn init_remote(
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("safekeeper/v1".to_string()),
delimiter,
}),
};
@@ -368,10 +346,7 @@ async fn list_objects_with_retries(
{
Ok(response) => return Ok(response),
Err(e) => {
error!(
"list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
);
error!("list_objects_v2 query failed: {e}");
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
@@ -421,50 +396,3 @@ async fn download_object_with_retries(
anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
}
/// Downloads the object `key` from `bucket_name` into `local_path`, optionally
/// pinning a specific `version_id`.
///
/// The body is first written to `{local_path}.tmp` and then renamed onto
/// `local_path`, so `local_path` only appears once the copy has finished.
///
/// # Errors
///
/// Only a failed `get_object` request is retried, up to `MAX_RETRIES` times
/// with a one-second pause between attempts. Failures to remove a stale
/// temporary file, create the temporary file, copy the body, or rename are
/// returned immediately. An error is also returned once every attempt has failed.
async fn download_object_to_file(
s3_client: &Client,
bucket_name: &str,
key: &str,
version_id: Option<&str>,
local_path: &Utf8Path,
) -> anyhow::Result<()> {
let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp"));
for _ in 0..MAX_RETRIES {
// Start each attempt from a clean slate; a missing temporary file is fine.
tokio::fs::remove_file(&tmp_path)
.await
.or_else(fs_ext::ignore_not_found)?;
let mut file = tokio::fs::File::create(&tmp_path)
.await
.context("Opening output file")?;
let request = s3_client.get_object().bucket(bucket_name).key(key);
// Only pin a version when the caller asked for one.
let request = match version_id {
Some(version_id) => request.version_id(version_id),
None => request,
};
let response_stream = match request.send().await {
Ok(response) => response,
Err(e) => {
error!(
"Failed to download object for key {key} version {}: {e:#}",
version_id.unwrap_or("")
);
// Back off briefly, then retry with a fresh temporary file.
tokio::time::sleep(Duration::from_secs(1)).await;
continue;
}
};
let mut read_stream = response_stream.body.into_async_read();
tokio::io::copy(&mut read_stream, &mut file).await?;
// Publish the finished download under its final name.
tokio::fs::rename(&tmp_path, local_path).await?;
return Ok(());
}
anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
}

View File

@@ -1,16 +1,9 @@
use anyhow::bail;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
use s3_scrubber::{
init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
NodeKind, TraversingDepth,
};
use s3_scrubber::scan_metadata::scan_metadata;
use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
use clap::{Parser, Subcommand};
use utils::id::TenantId;
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
@@ -39,28 +32,11 @@ enum Command {
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
},
#[command(verbatim_doc_comment)]
ScanMetadata {
#[arg(short, long)]
node_kind: NodeKind,
#[arg(short, long, default_value_t = false)]
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
/// For safekeeper node_kind only, table in the db with debug dump
#[arg(long, default_value = None)]
dump_db_table: Option<String>,
},
TenantSnapshot {
#[arg(long = "tenant-id")]
tenant_id: TenantId,
#[arg(long = "concurrency", short = 'j', default_value_t = 8)]
concurrency: usize,
#[arg(short, long)]
output_path: Utf8PathBuf,
},
}
@@ -74,7 +50,6 @@ async fn main() -> anyhow::Result<()> {
Command::ScanMetadata { .. } => "scan",
Command::FindGarbage { .. } => "find-garbage",
Command::PurgeGarbage { .. } => "purge-garbage",
Command::TenantSnapshot { .. } => "tenant-snapshot",
};
let _guard = init_logging(&format!(
"{}_{}_{}_{}.log",
@@ -85,75 +60,33 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata {
json,
tenant_ids,
node_kind,
dump_db_connstr,
dump_db_table,
} => {
if let NodeKind::Safekeeper = node_kind {
let dump_db_connstr =
dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
let dump_db_table =
dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
let summary = scan_safekeeper_metadata(
bucket_config.clone(),
tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
dump_db_connstr,
dump_db_table,
)
.await?;
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
Command::ScanMetadata { json, tenant_ids } => {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
if summary.is_fatal() {
bail!("Fatal scrub errors detected");
}
if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
bail!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
);
}
Ok(())
} else {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}
}
}
@@ -169,14 +102,5 @@ async fn main() -> anyhow::Result<()> {
Command::PurgeGarbage { input_path, mode } => {
purge_garbage(input_path, mode, !cli.delete).await
}
Command::TenantSnapshot {
tenant_id,
output_path,
concurrency,
} => {
let downloader =
SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?;
downloader.download().await
}
}
}

View File

@@ -5,7 +5,7 @@ use tokio_stream::Stream;
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
use pageserver_api::shard::TenantShardId;
use utils::id::{TenantId, TimelineId};
use utils::id::TimelineId;
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
@@ -45,62 +45,6 @@ pub fn stream_tenants<'a>(
}
}
/// Given a TenantId, output a stream of the TenantShardIds found under that tenant's
/// shard prefix, discovered via ListObjectsv2.
///
/// All listing pages are fetched before the stream is built; the returned stream only
/// replays the collected results. A listing failure is recorded as an `Err` item and
/// stops further listing, and a common prefix whose first path component does not parse
/// as a `TenantShardId` is recorded as an `Err` item as well.
pub async fn stream_tenant_shards<'a>(
    s3_client: &'a Client,
    target: &'a RootTarget,
    tenant_id: TenantId,
) -> anyhow::Result<impl Stream<Item = Result<TenantShardId, anyhow::Error>> + 'a> {
    let shards_target = target.tenant_shards_prefix(&tenant_id);
    // Every returned common prefix is interpreted relative to the tenants root.
    let tenants_root = target.tenants_root();
    let mut tenant_shard_ids: Vec<Result<TenantShardId, anyhow::Error>> = Vec::new();
    let mut continuation_token = None;

    loop {
        tracing::info!("Listing in {}", shards_target.prefix_in_bucket);
        let listing =
            list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await;
        let fetch_response = match listing {
            Ok(response) => response,
            Err(e) => {
                // Surface the failure through the stream and stop paging.
                tenant_shard_ids.push(Err(e));
                break;
            }
        };

        tenant_shard_ids.extend(
            fetch_response
                .common_prefixes()
                .iter()
                .filter_map(|common_prefix| common_prefix.prefix())
                .filter_map(|full_prefix| -> Option<&str> {
                    full_prefix
                        .strip_prefix(&tenants_root.prefix_in_bucket)?
                        .strip_suffix('/')
                })
                .map(|entry_id_str| {
                    // `split` always yields at least one item, so this cannot panic.
                    let first_part = entry_id_str.split('/').next().unwrap();
                    first_part
                        .parse::<TenantShardId>()
                        .with_context(|| format!("Incorrect entry id str: {first_part}"))
                }),
        );

        let Some(new_token) = fetch_response.next_continuation_token else {
            break;
        };
        continuation_token = Some(new_token);
    }

    Ok(stream! {
        for entry in tenant_shard_ids {
            let id = entry?;
            yield Ok(id);
        }
    })
}
/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
/// using ListObjectsv2. The listing is done before the stream is built, so that this
/// function can be used to generate concurrency on a stream using buffer_unordered.
@@ -114,7 +58,7 @@ pub async fn stream_tenant_timelines<'a>(
let timelines_target = target.timelines_root(&tenant);
loop {
tracing::debug!("Listing in {}", tenant);
tracing::info!("Listing in {}", tenant);
let fetch_response =
list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
.await;
@@ -151,7 +95,7 @@ pub async fn stream_tenant_timelines<'a>(
}
}
tracing::debug!("Yielding for {}", tenant);
tracing::info!("Yielding for {}", tenant);
Ok(stream! {
for i in timeline_ids {
let id = i?;

View File

@@ -1,236 +0,0 @@
use std::{collections::HashSet, str::FromStr};
use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use serde::Serialize;
use tokio_postgres::types::PgLsn;
use tracing::{error, info, trace};
use utils::{
id::{TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
};
use crate::{
cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
/// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
/// Aggregated outcome of a safekeeper metadata scan; serializable for JSON output.
#[derive(Serialize)]
pub struct MetadataSummary {
/// Number of timelines for which a check result was received.
timeline_count: usize,
/// Timelines whose check reported `is_ok == false`.
with_errors: HashSet<TenantTimelineId>,
/// Number of checked timelines that were reported as deleted.
deleted_count: usize,
}
impl MetadataSummary {
    /// Creates a summary with all counters at zero and no failing timelines.
    fn new() -> Self {
        Self {
            with_errors: HashSet::new(),
            timeline_count: 0,
            deleted_count: 0,
        }
    }

    /// One-line human readable rendering: the number of timelines seen and how many of
    /// them had errors. `deleted_count` is not part of this string.
    pub fn summary_string(&self) -> String {
        let error_count = self.with_errors.len();
        format!(
            "timeline_count: {}, with_errors: {}",
            self.timeline_count, error_count
        )
    }

    /// True when no timeline was counted at all.
    pub fn is_empty(&self) -> bool {
        self.timeline_count == 0
    }

    /// True when at least one timeline was recorded in `with_errors`.
    pub fn is_fatal(&self) -> bool {
        !self.with_errors.is_empty()
    }
}
/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
/// statistics.
///
/// It works by listing timelines along with timeline_start_lsn and backup_lsn
/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL
/// segments are missing, before complaining control plane is queried to check if
/// the project wasn't deleted in the meanwhile.
pub async fn scan_safekeeper_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantId>,
dump_db_connstr: String,
dump_db_table: String,
) -> anyhow::Result<MetadataSummary> {
info!(
"checking bucket {}, region {}, dump_db_table {}",
bucket_config.bucket, bucket_config.region, dump_db_table
);
// Use the native TLS implementation (Neon requires TLS)
let tls_connector =
postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
let tenant_filter_clause = if !tenant_ids.is_empty() {
format!(
"and tenant_id in ({})",
tenant_ids
.iter()
.map(|t| format!("'{}'", t))
.collect::<Vec<_>>()
.join(", ")
)
} else {
"".to_owned()
};
let query = format!(
"select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
dump_db_table, tenant_filter_clause,
);
info!("query is {}", query);
let timelines = client.query(&query, &[]).await?;
info!("loaded {} timelines", timelines.len());
let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
let console_config = ConsoleConfig::from_env()?;
let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id");
let timeline_start_lsn_pg: PgLsn = row.get(2);
let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
let backup_lsn_pg: PgLsn = row.get(3);
let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
check_timeline(
&s3_client,
&target,
&cloud_admin_api_client,
ttid,
timeline_start_lsn,
backup_lsn,
)
});
// Run multiple check_timeline's concurrently.
const CONCURRENCY: usize = 32;
let mut timelines = checks.try_buffered(CONCURRENCY);
let mut summary = MetadataSummary::new();
while let Some(r) = timelines.next().await {
let res = r?;
summary.timeline_count += 1;
if !res.is_ok {
summary.with_errors.insert(res.ttid);
}
if res.is_deleted {
summary.deleted_count += 1;
}
}
Ok(summary)
}
/// Outcome of checking one timeline's WAL segments in S3.
struct TimelineCheckResult {
// Timeline this result belongs to.
ttid: TenantTimelineId,
// False only when expected segments are missing and the timeline is not deleted.
is_ok: bool,
is_deleted: bool, // timeline is deleted in cplane
}
/// List s3 and check that it has all expected WAL for the ttid.
///
/// The expected segments are those from the one containing `timeline_start_lsn` up to,
/// but not including, the one containing `backup_lsn`. When segments are missing, the
/// control plane is asked whether the timeline was deleted before reporting a problem.
///
/// Returns `Err` only if the check itself could not be carried out (listing or control
/// plane request failed). Otherwise the result has `is_ok == false` when segments are
/// missing for a timeline that is not deleted; that case is also logged with `error!`.
async fn check_timeline(
    s3_client: &Client,
    root: &RootTarget,
    api_client: &CloudAdminApiClient,
    ttid: TenantTimelineId,
    timeline_start_lsn: Lsn,
    backup_lsn: Lsn,
) -> anyhow::Result<TimelineCheckResult> {
    trace!(
        "checking ttid {}, should contain WAL [{}-{}]",
        ttid,
        timeline_start_lsn,
        backup_lsn
    );

    // calculate expected segfiles; entries are removed as they are found in s3
    let first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
    let last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
    let mut missing_segfiles: HashSet<String> = (first_segno..last_segno)
        .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE))
        .collect();
    let expected_files_num = missing_segfiles.len();
    trace!("expecting {} files", expected_files_num);

    // now list s3 and check if it misses something
    let ttshid =
        TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
    let mut timeline_dir_target = root.timeline_root(&ttshid);
    // stream_listing yields only common_prefixes if delimiter is not empty, but
    // we need files, so unset it.
    timeline_dir_target.delimiter = String::new();

    let mut listing = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
    while let Some(obj) = listing.try_next().await? {
        let key = obj.key();
        let seg_name = key
            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
            .expect("failed to extract segment name");
        missing_segfiles.remove(seg_name);
    }

    if missing_segfiles.is_empty() {
        // Everything we expected is present.
        return Ok(TimelineCheckResult {
            ttid,
            is_ok: true,
            is_deleted: false,
        });
    }

    // Before complaining check cplane, probably timeline is already deleted.
    let branch = api_client
        .find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
        .await?;
    let deleted = if let Some(branch) = branch {
        branch.deleted
    } else {
        // note: should be careful with selecting proper cplane address
        info!("ttid {} not found, assuming it is deleted", ttid);
        true
    };

    if !deleted {
        error!(
            "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
            ttid,
            missing_segfiles.len(),
            expected_files_num,
            timeline_start_lsn,
            backup_lsn,
        );
    }

    // With segments missing, the timeline is fine exactly when it has been deleted.
    Ok(TimelineCheckResult {
        ttid,
        is_ok: deleted,
        is_deleted: deleted,
    })
}

View File

@@ -1,293 +0,0 @@
use std::collections::HashMap;
use std::sync::Arc;
use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData};
use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
use crate::{
download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
use anyhow::Context;
use async_stream::stream;
use aws_sdk_s3::Client;
use camino::Utf8PathBuf;
use futures::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::IndexPart;
use pageserver_api::shard::TenantShardId;
use utils::generation::Generation;
use utils::id::TenantId;
/// Downloads the layers and index files of one tenant from S3 into a local directory.
pub struct SnapshotDownloader {
// Client and root target produced by `init_remote` for the pageserver node kind.
s3_client: Arc<Client>,
s3_root: RootTarget,
// Retained for the bucket name used in direct S3 requests.
bucket_config: BucketConfig,
// Tenant whose shards are snapshotted.
tenant_id: TenantId,
// Directory under which `<tenant_shard_id>/timelines/<timeline_id>/...` is written.
output_path: Utf8PathBuf,
// Number of layer downloads kept in flight per timeline.
concurrency: usize,
}
impl SnapshotDownloader {
/// Builds a downloader for `tenant_id` that writes below `output_path`.
///
/// Initialises the S3 client and pageserver root target from `bucket_config`;
/// fails if `init_remote` fails.
pub fn new(
    bucket_config: BucketConfig,
    tenant_id: TenantId,
    output_path: Utf8PathBuf,
    concurrency: usize,
) -> anyhow::Result<Self> {
    // `init_remote` consumes its config, and we keep our own copy for later requests.
    let remote = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
    let (s3_client, s3_root) = remote;
    let downloader = Self {
        s3_client,
        s3_root,
        bucket_config,
        tenant_id,
        output_path,
        concurrency,
    };
    Ok(downloader)
}
/// Downloads a single layer of `ttid` into the local snapshot directory, unless a file
/// with the same name already exists there.
///
/// Returns the layer name and metadata it was given so callers can report progress.
///
/// # Errors
///
/// Fails if the local existence check, the version listing or the download fails, or if
/// S3 reports no version for the exact layer key.
///
/// # Panics
///
/// Panics if the layer's shard does not match the shard of `ttid`.
async fn download_layer(
    &self,
    ttid: TenantShardTimelineId,
    layer_name: LayerFileName,
    layer_metadata: IndexLayerMetadata,
) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> {
    // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use
    // different layer names (remote-style has the generation suffix)
    let local_path = self.output_path.join(format!(
        "{}/timelines/{}/{}{}",
        ttid.tenant_shard_id,
        ttid.timeline_id,
        layer_name.file_name(),
        layer_metadata.generation.get_suffix()
    ));

    // We should only be called for layers that are owned by the input TTID
    assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index());

    // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file
    // already exists on local disk, we assume it is fully correct and skip it.
    if tokio::fs::try_exists(&local_path).await? {
        tracing::debug!("{} already exists", local_path);
        return Ok((layer_name, layer_metadata));
    }

    tracing::debug!("{} requires download...", local_path);
    let timeline_root = self.s3_root.timeline_root(&ttid);
    let remote_layer_path = format!(
        "{}{}{}",
        timeline_root.prefix_in_bucket,
        layer_name.file_name(),
        layer_metadata.generation.get_suffix()
    );

    // List versions: the object might be deleted.
    let versions = self
        .s3_client
        .list_object_versions()
        .bucket(self.bucket_config.bucket.clone())
        .prefix(&remote_layer_path)
        .send()
        .await?;
    // The listing is by prefix, so it may also return versions of other keys that merely
    // start with our path. Only accept a version whose key is exactly the layer we want.
    let Some(version) = versions.versions.as_ref().and_then(|found| {
        found
            .iter()
            .find(|candidate| candidate.key.as_deref() == Some(remote_layer_path.as_str()))
    }) else {
        return Err(anyhow::anyhow!("No versions found for {remote_layer_path}"));
    };

    download_object_to_file(
        &self.s3_client,
        &self.bucket_config.bucket,
        &remote_layer_path,
        version.version_id.as_deref(),
        &local_path,
    )
    .await?;
    tracing::debug!("Downloaded successfully to {local_path}");

    Ok((layer_name, layer_metadata))
}
/// Download many layers belonging to the same TTID, with some concurrency
async fn download_layers(
&self,
ttid: TenantShardTimelineId,
layers: Vec<(LayerFileName, IndexLayerMetadata)>,
) -> anyhow::Result<()> {
let layer_count = layers.len();
tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
let layers_stream = stream! {
for (layer_name, layer_metadata) in layers {
yield self.download_layer(ttid, layer_name, layer_metadata);
}
};
tokio::fs::create_dir_all(self.output_path.join(format!(
"{}/timelines/{}",
ttid.tenant_shard_id, ttid.timeline_id
)))
.await?;
let layer_results = layers_stream.buffered(self.concurrency);
let mut layer_results = std::pin::pin!(layer_results);
let mut err = None;
let mut download_count = 0;
while let Some(i) = layer_results.next().await {
download_count += 1;
match i {
Ok((layer_name, layer_metadata)) => {
tracing::info!(
"[{download_count}/{layer_count}] OK: {} bytes {ttid} {}",
layer_metadata.file_size,
layer_name.file_name()
);
}
Err(e) => {
// Warn and continue: we will download what we can
tracing::warn!("Download error: {e}");
err = Some(e);
}
}
}
if let Some(e) = err {
tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}");
Err(e)
} else {
Ok(())
}
}
/// Downloads the layers of `ttid` that belong to its own shard and writes its index file.
///
/// Layers referenced by the index but owned by a shard with a different shard count are
/// not downloaded here; they are accumulated into `ancestor_layers` for the caller.
///
/// The index file is written even if some layer downloads failed; the layer download
/// result is returned afterwards.
///
/// # Errors
///
/// Fails if the index cannot be serialized or written, or if any layer download failed.
///
/// # Panics
///
/// Panics if an ancestor layer was already recorded with different metadata.
async fn download_timeline(
    &self,
    ttid: TenantShardTimelineId,
    index_part: IndexPart,
    index_part_generation: Generation,
    ancestor_layers: &mut HashMap<
        TenantShardTimelineId,
        HashMap<LayerFileName, IndexLayerMetadata>,
    >,
) -> anyhow::Result<()> {
    use std::collections::hash_map::Entry;

    // Serialize up front: `index_part.layer_metadata` is consumed below.
    let index_json = serde_json::to_string(&index_part).context("serializing index")?;

    let layers: Vec<(LayerFileName, IndexLayerMetadata)> = index_part
        .layer_metadata
        .into_iter()
        .filter_map(|(layer_name, layer_metadata)| {
            if layer_metadata.shard.shard_count == ttid.tenant_shard_id.shard_count {
                return Some((layer_name, layer_metadata));
            }

            // Accumulate ancestor layers for later download
            let ancestor_ttid = TenantShardTimelineId::new(
                TenantShardId {
                    tenant_id: ttid.tenant_shard_id.tenant_id,
                    shard_number: layer_metadata.shard.shard_number,
                    shard_count: layer_metadata.shard.shard_count,
                },
                ttid.timeline_id,
            );
            match ancestor_layers
                .entry(ancestor_ttid)
                .or_default()
                .entry(layer_name)
            {
                Entry::Occupied(entry) => {
                    // Descendent shards that reference a layer from an ancestor should always have matching metadata,
                    // as their siblings, because it is read atomically during a shard split.
                    assert_eq!(entry.get(), &layer_metadata);
                }
                Entry::Vacant(entry) => {
                    entry.insert(layer_metadata);
                }
            }
            None
        })
        .collect();

    let download_result = self.download_layers(ttid, layers).await;

    // Write index last, after the layer downloads have been attempted. This happens
    // regardless of whether they all succeeded.
    let local_index_path = self.output_path.join(format!(
        "{}/timelines/{}/index_part.json{}",
        ttid.tenant_shard_id,
        ttid.timeline_id,
        index_part_generation.get_suffix()
    ));
    tokio::fs::write(&local_index_path, index_json)
        .await
        .context("writing index")?;

    download_result
}
pub async fn download(&self) -> anyhow::Result<()> {
let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?;
// Generate a stream of TenantShardId
let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?;
let shards: Vec<TenantShardId> = shards.try_collect().await?;
// Only read from shards that have the highest count: avoids redundantly downloading
// from ancestor shards.
let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else {
anyhow::bail!("No shards found");
};
// We will build a collection of layers in anccestor shards to download (this will only
// happen if this tenant has been split at some point)
let mut ancestor_layers: HashMap<
TenantShardTimelineId,
HashMap<LayerFileName, IndexLayerMetadata>,
> = Default::default();
for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
// Generate a stream of TenantTimelineId
let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?;
// Generate a stream of S3TimelineBlobData
async fn load_timeline_index(
s3_client: &Client,
target: &RootTarget,
ttid: TenantShardTimelineId,
) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
let data = list_timeline_blobs(s3_client, ttid, target).await?;
Ok((ttid, data))
}
let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid));
let mut timelines = std::pin::pin!(timelines.try_buffered(8));
while let Some(i) = timelines.next().await {
let (ttid, data) = i?;
match data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers: _,
} => {
self.download_timeline(
ttid,
index_part,
index_part_generation,
&mut ancestor_layers,
)
.await
.context("Downloading timeline")?;
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(_) => {
tracing::error!("Bad metadata in timeline {ttid}");
}
};
}
}
for (ttid, layers) in ancestor_layers.into_iter() {
tracing::info!(
"Downloading {} layers from ancvestor timeline {ttid}...",
layers.len()
);
self.download_layers(ttid, layers.into_iter().collect())
.await?;
}
Ok(())
}
}

View File

@@ -177,10 +177,6 @@ struct Args {
/// Controls how long backup will wait until uploading the partial segment.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
partial_backup_timeout: Duration,
/// Disable task to push messages to broker every second. Supposed to
/// be used in tests.
#[arg(long)]
disable_periodic_broker_push: bool,
}
// Like PathBufValueParser, but allows empty string.
@@ -313,7 +309,6 @@ async fn main() -> anyhow::Result<()> {
walsenders_keep_horizon: args.walsenders_keep_horizon,
partial_backup_enabled: args.partial_backup_enabled,
partial_backup_timeout: args.partial_backup_timeout,
disable_periodic_broker_push: args.disable_periodic_broker_push,
};
// initialize sentry if SENTRY_DSN is provided

Some files were not shown because too many files have changed in this diff Show More