Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-11 22:50:37 +00:00

Compare commits: 26 commits (nix-devenv ... compute_ct)
| SHA1 |
|---|
| a77dd0700c |
| c9472434c9 |
| f9c2945f74 |
| 5558457c84 |
| 26e6ff8ba6 |
| 50a45e67dc |
| fcbe60f436 |
| e018cac1f7 |
| a74b60066c |
| 3a2f10712a |
| 4ac4b21598 |
| 9f792f9c0b |
| 7434674d86 |
| ea37234ccc |
| 3da54e6d90 |
| 010f0a310a |
| eb53345d48 |
| 45c625fb34 |
| 84b6b95783 |
| 577982b778 |
| 574645412b |
| 11945e64ec |
| cddafc79e1 |
| af7cca4949 |
| 89cae64e38 |
| 1f417af9fd |
4  .github/workflows/build_and_test.yml (vendored)
@@ -478,6 +478,7 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true

# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -557,6 +558,9 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: false
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
88  Cargo.lock (generated)
@@ -722,9 +722,9 @@ dependencies = [

[[package]]
name = "azure_core"
version = "0.18.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
dependencies = [
 "async-trait",
 "base64 0.21.1",
@@ -752,9 +752,9 @@ dependencies = [

[[package]]
name = "azure_identity"
version = "0.18.1"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
dependencies = [
 "async-lock",
 "async-trait",
@@ -772,9 +772,9 @@ dependencies = [

[[package]]
name = "azure_storage"
version = "0.18.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
dependencies = [
 "RustyXML",
 "async-lock",
@@ -791,9 +791,9 @@ dependencies = [

[[package]]
name = "azure_storage_blobs"
version = "0.18.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
dependencies = [
 "RustyXML",
 "azure_core",
@@ -812,9 +812,9 @@ dependencies = [

[[package]]
name = "azure_svc_blobstorage"
version = "0.18.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
dependencies = [
 "azure_core",
 "bytes",
@@ -1319,6 +1319,7 @@ dependencies = [
 "git-version",
 "hex",
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
 "nix 0.27.1",
 "once_cell",
@@ -2763,9 +2764,9 @@ dependencies = [

[[package]]
name = "js-sys"
version = "0.3.63"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790"
checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
dependencies = [
 "wasm-bindgen",
]
@@ -3184,6 +3185,16 @@ dependencies = [
 "winapi",
]

[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
 "overload",
 "winapi",
]

[[package]]
name = "num"
version = "0.4.1"
@@ -3520,6 +3531,12 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"

[[package]]
name = "p256"
version = "0.11.1"
@@ -5095,8 +5112,11 @@ dependencies = [
 "hex",
 "histogram",
 "itertools",
 "native-tls",
 "pageserver",
 "pageserver_api",
 "postgres-native-tls",
 "postgres_ffi",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -5105,8 +5125,10 @@ dependencies = [
 "serde_with",
 "thiserror",
 "tokio",
 "tokio-postgres",
 "tokio-rustls 0.25.0",
 "tokio-stream",
 "tokio-util",
 "tracing",
 "tracing-appender",
 "tracing-subscriber",
@@ -6413,11 +6435,10 @@ dependencies = [

[[package]]
name = "tracing"
version = "0.1.37"
version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
 "cfg-if",
 "log",
 "pin-project-lite",
 "tracing-attributes",
@@ -6437,9 +6458,9 @@ dependencies = [

[[package]]
name = "tracing-attributes"
version = "0.1.24"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
 "proc-macro2",
 "quote",
@@ -6448,9 +6469,9 @@ dependencies = [

[[package]]
name = "tracing-core"
version = "0.1.31"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
 "once_cell",
 "valuable",
@@ -6508,6 +6529,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
 "matchers",
 "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -6905,9 +6927,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

[[package]]
name = "wasm-bindgen"
version = "0.2.86"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
dependencies = [
 "cfg-if",
 "wasm-bindgen-macro",
@@ -6915,9 +6937,9 @@ dependencies = [

[[package]]
name = "wasm-bindgen-backend"
version = "0.2.86"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
dependencies = [
 "bumpalo",
 "log",
@@ -6930,9 +6952,9 @@ dependencies = [

[[package]]
name = "wasm-bindgen-futures"
version = "0.4.36"
version = "0.4.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0"
dependencies = [
 "cfg-if",
 "js-sys",
@@ -6942,9 +6964,9 @@ dependencies = [

[[package]]
name = "wasm-bindgen-macro"
version = "0.2.86"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -6952,9 +6974,9 @@ dependencies = [

[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.86"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
dependencies = [
 "proc-macro2",
 "quote",
@@ -6965,9 +6987,9 @@ dependencies = [

[[package]]
name = "wasm-bindgen-shared"
version = "0.2.86"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"

[[package]]
name = "wasm-streams"
@@ -6999,9 +7021,9 @@ dependencies = [

[[package]]
name = "web-sys"
version = "0.3.63"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
dependencies = [
 "js-sys",
 "wasm-bindgen",
10  Cargo.toml
@@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
azure_storage_blobs = "0.18"
azure_core = "0.19"
azure_identity = "0.19"
azure_storage = "0.19"
azure_storage_blobs = "0.19"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"
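The last pair of lines above is the substantive Cargo.toml change: the "ansi" feature joins the tracing-subscriber feature list, which is also why nu-ansi-term and overload appear as new entries in Cargo.lock. As a hedged illustration (not Neon's actual logging setup), this is roughly what that feature gates:

```rust
// Minimal sketch, assuming a standalone binary: the "ansi" cargo feature of
// tracing-subscriber gates colored terminal output; with_ansi() toggles it.
use tracing_subscriber::fmt;

fn main() {
    fmt()
        .with_ansi(true) // only meaningful when the "ansi" feature is enabled
        .init();
    tracing::info!("colored log line");
}
```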
18  Makefile
@@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux)
	# Seccomp BPF is only available for Linux
	PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
	# macOS with brew-installed openssl requires explicit paths
	# It can be configured with OPENSSL_PREFIX variable
	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
	ifndef DISABLE_HOMEBREW
		# macOS with brew-installed openssl requires explicit paths
		# It can be configured with OPENSSL_PREFIX variable
		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
	endif
endif

# Use -C option so that when PostgreSQL "make install" installs the
@@ -51,6 +51,7 @@ use tracing::{error, info};
use url::Url;

use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;

use compute_tools::compute::{
    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -68,6 +69,29 @@ use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "latest";

fn main() -> Result<()> {
    let (build_tag, clap_args) = init()?;

    let (pg_handle, start_pg_result) =
    {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();

        let cli_result = process_cli(&clap_args)?;

        let wait_spec_result = wait_spec(build_tag, cli_result)?;

        start_postgres(&clap_args, wait_spec_result)?

        // Startup is finished, exit the startup tracing context
    };

    // PostgreSQL is now running, if startup was successful. Wait until it exits.
    let wait_pg_result = wait_postgres(pg_handle)?;

    cleanup_and_exit(start_pg_result, wait_pg_result)
}

fn init() -> Result<(String, clap::ArgMatches)> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -82,35 +106,11 @@ fn main() -> Result<()> {
        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

    let ext_remote_storage = matches
        .get_one::<String>("remote-ext-config")
        // Compatibility hack: if the control plane specified any remote-ext-config
        // use the default value for extension storage proxy gateway.
        // Remove this once the control plane is updated to pass the gateway URL
        .map(|conf| {
            if conf.starts_with("http") {
                conf.trim_end_matches('/')
            } else {
                "http://pg-ext-s3-gateway"
            }
        });

    let http_port = *matches
        .get_one::<u16>("http-port")
        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
    let connstr = matches
        .get_one::<String>("connstr")
        .expect("Postgres connection string is required");
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
    Ok((build_tag, cli().get_matches()))
}

fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -147,7 +147,7 @@ fn main() -> Result<()> {
    if let Ok(val) = std::env::var("TRACESTATE") {
        startup_tracing_carrier.insert("tracestate".to_string(), val);
    }
    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
    if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
@@ -157,8 +157,42 @@ fn main() -> Result<()> {
        Some(guard)
    } else {
        None
    };
    }
}

fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
    let pgbin_default = "postgres";
    let pgbin = matches
        .get_one::<String>("pgbin")
        .map(|s| s.as_str())
        .unwrap_or(pgbin_default);

    let ext_remote_storage = matches
        .get_one::<String>("remote-ext-config")
        // Compatibility hack: if the control plane specified any remote-ext-config
        // use the default value for extension storage proxy gateway.
        // Remove this once the control plane is updated to pass the gateway URL
        .map(|conf| {
            if conf.starts_with("http") {
                conf.trim_end_matches('/')
            } else {
                "http://pg-ext-s3-gateway"
            }
        });

    let http_port = *matches
        .get_one::<u16>("http-port")
        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
    let connstr = matches
        .get_one::<String>("connstr")
        .expect("Postgres connection string is required");
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

@@ -199,6 +233,45 @@ fn main() -> Result<()> {
        }
    };

    let result = ProcessCliResult {
        // directly from CLI:
        connstr,
        pgdata,
        pgbin,
        ext_remote_storage,
        http_port,
        // others:
        spec,
        live_config_allowed,
    };

    Ok(result)
}

struct ProcessCliResult<'clap> {
    connstr: &'clap str,
    pgdata: &'clap str,
    pgbin: &'clap str,
    ext_remote_storage: Option<&'clap str>,
    http_port: u16,

    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
    spec: Option<ComputeSpec>,
    live_config_allowed: bool,
}

fn wait_spec(
    build_tag: String,
    ProcessCliResult {
        connstr,
        pgdata,
        pgbin,
        ext_remote_storage,
        http_port,
        spec,
        live_config_allowed,
    }: ProcessCliResult,
) -> Result<WaitSpecResult> {
    let mut new_state = ComputeState::new();
    let spec_set;

@@ -237,8 +310,6 @@ fn main() -> Result<()> {
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

    let extension_server_port: u16 = http_port;

    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
@@ -255,6 +326,19 @@ fn main() -> Result<()> {
        }
    }

    Ok(WaitSpecResult { compute, http_port })
}

struct WaitSpecResult {
    compute: Arc<ComputeNode>,
    // passed through from ProcessCliResult
    http_port: u16,
}

fn start_postgres(
    matches: &clap::ArgMatches,
    WaitSpecResult { compute, http_port }: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();

@@ -281,9 +365,10 @@ fn main() -> Result<()> {
    let _monitor_handle = launch_monitor(&compute);
    let _configurator_handle = launch_configurator(&compute);

    let extension_server_port: u16 = http_port;

    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
@@ -334,7 +419,7 @@ fn main() -> Result<()> {
    // This token is used internally by the monitor to clean up all threads
    let token = CancellationToken::new();

    let vm_monitor = &rt.as_ref().map(|rt| {
    let vm_monitor = rt.as_ref().map(|rt| {
        rt.spawn(vm_monitor::start(
            Box::leak(Box::new(vm_monitor::Args {
                cgroup: cgroup.cloned(),
@@ -347,12 +432,43 @@ fn main() -> Result<()> {
        }
    }

    Ok((
        pg,
        StartPostgresResult {
            delay_exit,
            compute,
            #[cfg(target_os = "linux")]
            rt,
            #[cfg(target_os = "linux")]
            token,
            #[cfg(target_os = "linux")]
            vm_monitor,
        },
    ))
}

type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);

struct StartPostgresResult {
    delay_exit: bool,
    // passed through from WaitSpecResult
    compute: Arc<ComputeNode>,

    #[cfg(target_os = "linux")]
    rt: Option<tokio::runtime::Runtime>,
    #[cfg(target_os = "linux")]
    token: tokio_util::sync::CancellationToken,
    #[cfg(target_os = "linux")]
    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}

fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    let mut exit_code = None;
    if let Some((mut pg, logs_handle)) = pg {
        // Startup is finished, exit the startup tracing span
        drop(startup_context_guard);

        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
@@ -367,6 +483,26 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

    Ok(WaitPostgresResult { exit_code })
}

struct WaitPostgresResult {
    exit_code: Option<i32>,
}

fn cleanup_and_exit(
    StartPostgresResult {
        mut delay_exit,
        compute,
        #[cfg(target_os = "linux")]
        vm_monitor,
        #[cfg(target_os = "linux")]
        token,
        #[cfg(target_os = "linux")]
        rt,
    }: StartPostgresResult,
    WaitPostgresResult { exit_code }: WaitPostgresResult,
) -> Result<()> {
    // Terminate the vm_monitor so it releases the file watcher on
    // /sys/fs/cgroup/neon-postgres.
    // Note: the vm-monitor only runs on linux because it requires cgroups.
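The refactor above carries state between startup phases by destructuring result structs directly in function signatures (see wait_spec, start_postgres, and cleanup_and_exit). A minimal standalone sketch of that Rust pattern, with invented names:

```rust
// Invented types for illustration; the real code uses ProcessCliResult,
// WaitSpecResult, and StartPostgresResult in exactly this shape.
struct PhaseOne {
    port: u16,
    name: String,
}

// Destructure the carry-over struct in the parameter list itself, so the
// function body works with plain bindings instead of field accesses.
fn phase_two(PhaseOne { port, name }: PhaseOne) -> String {
    format!("{name} listening on {port}")
}

fn main() {
    let out = phase_two(PhaseOne { port: 3080, name: "compute".to_string() });
    assert_eq!(out, "compute listening on 3080");
}
```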
@@ -17,6 +17,7 @@ nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
@@ -1554,8 +1554,8 @@ fn cli() -> Command {
        Command::new("storage_controller")
            .arg_required_else_help(true)
            .about("Manage storage_controller")
            .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
            .subcommand(Command::new("stop").about("Stop local pageserver")
            .subcommand(Command::new("start").about("Start storage controller"))
            .subcommand(Command::new("stop").about("Stop storage controller")
                .arg(stop_mode_arg.clone()))
    )
    .subcommand(
@@ -17,6 +17,7 @@ use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -66,6 +67,10 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

    // Configuration for the storage controller (1 per neon_local environment)
    #[serde(default)]
    pub storage_controller: NeonStorageControllerConf,

    /// This Vec must always contain at least one pageserver
    pub pageservers: Vec<PageServerConf>,
@@ -98,6 +103,29 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
}

/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
    /// Heartbeat timeout before marking a node offline
    #[serde(with = "humantime_serde")]
    pub max_unavailable: Duration,
}

impl NeonStorageControllerConf {
    // Use a shorter pageserver unavailability interval than the default to speed up tests.
    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
        std::time::Duration::from_secs(10);
}

impl Default for NeonStorageControllerConf {
    fn default() -> Self {
        Self {
            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
        }
    }
}

// Dummy Default impl to satisfy Deserialize derive.
impl Default for NeonBroker {
    fn default() -> Self {
@@ -130,6 +158,7 @@ pub struct PageServerConf {
    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
    pub(crate) get_impl: Option<String>,
    pub(crate) validate_vectored_get: Option<bool>,
}

impl Default for PageServerConf {
@@ -143,6 +172,7 @@ impl Default for PageServerConf {
            virtual_file_io_engine: None,
            get_vectored_impl: None,
            get_impl: None,
            validate_vectored_get: None,
        }
    }
}
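The `#[serde(with = "humantime_serde")]` attribute on `max_unavailable` above lets the config carry a human-readable duration string. A minimal sketch of the round-trip, using an illustrative struct rather than the real `NeonStorageControllerConf`:

```rust
// Hedged sketch of humantime_serde in isolation; the struct here is
// illustrative, not the actual neon_local configuration type.
use std::time::Duration;
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Conf {
    #[serde(with = "humantime_serde")]
    max_unavailable: Duration,
}

fn main() {
    // "10s" in the TOML becomes a std Duration of ten seconds.
    let conf: Conf = toml::from_str(r#"max_unavailable = "10s""#).unwrap();
    assert_eq!(conf.max_unavailable, Duration::from_secs(10));
}
```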
@@ -93,6 +93,7 @@ impl PageServerNode {
            virtual_file_io_engine,
            get_vectored_impl,
            get_impl,
            validate_vectored_get,
        } = &self.conf;

        let id = format!("id={}", id);
@@ -117,6 +118,11 @@ impl PageServerNode {
        } else {
            String::new()
        };
        let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
            format!("validate_vectored_get={validate_vectored_get}")
        } else {
            String::new()
        };

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -131,6 +137,7 @@ impl PageServerNode {
            virtual_file_io_engine,
            get_vectored_impl,
            get_impl,
            validate_vectored_get,
        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -1,4 +1,7 @@
use crate::{background_process, local_env::LocalEnv};
use crate::{
    background_process,
    local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
@@ -32,15 +35,13 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
    config: NeonStorageControllerConf,
}

const COMMAND: &str = "storage_controller";

const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);

#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -135,6 +136,7 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
            config: env.storage_controller.clone(),
        }
    }

@@ -272,8 +274,6 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();

        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
            &max_unavailable.to_string(),
            &humantime::Duration::from(self.config.max_unavailable).to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())

@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.

The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

The `diesel` crate is used for defining models & migrations.
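The README names `diesel` without showing usage, so the sketch below is a hedged illustration of the models-and-migrations pattern it refers to; the table and columns are invented for illustration and are not the controller's actual schema:

```rust
use diesel::prelude::*;

// Hypothetical table definition, for illustration only. In a real diesel
// project this is usually generated into schema.rs from SQL migrations.
diesel::table! {
    nodes (node_id) {
        node_id -> BigInt,
        listen_http_addr -> Text,
        listen_http_port -> Integer,
    }
}

// A model struct mapping rows of that table; migrations would live as SQL
// files under a migrations/ directory managed by the diesel CLI.
#[derive(Queryable, Insertable)]
#[diesel(table_name = nodes)]
struct NodePersistence {
    node_id: i64,
    listen_http_addr: String,
    listen_http_port: i32,
}

fn main() {
    let row = NodePersistence {
        node_id: 1,
        listen_http_addr: "127.0.0.1".to_string(),
        listen_http_port: 1234,
    };
    // diesel::insert_into(nodes::table).values(&row) would persist it,
    // given an established database connection.
    let _ = row;
}
```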
@@ -1,7 +1,10 @@
use postgres_ffi::BLCKSZ;
use std::ops::Range;

use crate::key::Key;
use crate::{
    key::Key,
    shard::{ShardCount, ShardIdentity},
};
use itertools::Itertools;

///
@@ -14,6 +17,238 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
}

/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);

/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
///
/// When we iterate over keys within this object, we will skip any keys that don't belong
/// to this shard.
///
/// The start + end keys may not belong to the shard: these specify where layer files should
/// start + end, but we will never actually read/write those keys.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ShardedRange<'a> {
    pub shard_identity: &'a ShardIdentity,
    pub range: Range<Key>,
}

// Calculate the size of a range within the blocks of the same relation, or spanning only the
// top page in the previous relation's space.
fn contiguous_range_len(range: &Range<Key>) -> u32 {
    debug_assert!(is_contiguous_range(range));
    if range.start.field6 == 0xffffffff {
        range.end.field6 + 1
    } else {
        range.end.field6 - range.start.field6
    }
}

/// Return true if this key range includes only keys in the same relation's data blocks, or
/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
///
/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
/// be on our shard. Later in ShardedRange we do the extra work to figure out how much
/// of a given contiguous range is present on one shard.
///
/// This matters, because:
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
fn is_contiguous_range(range: &Range<Key>) -> bool {
    range.start.field1 == range.end.field1
        && range.start.field2 == range.end.field2
        && range.start.field3 == range.end.field3
        && range.start.field4 == range.end.field4
        && (range.start.field5 == range.end.field5
            || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
}
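To make the two branches of `contiguous_range_len` concrete, a standalone worked calculation; the field6 values below are assumed for illustration:

```rust
// Standalone arithmetic mirroring contiguous_range_len's two cases.
fn main() {
    // Case 1: start is a logical-size key (field6 == 0xffffffff), so the
    // range wraps into the next relation: the length counts the logical-size
    // key itself plus blocks 0..end.field6 of that next relation.
    let (start_field6, end_field6): (u32, u32) = (0xffffffff, 3);
    let len = if start_field6 == 0xffffffff { end_field6 + 1 } else { end_field6 - start_field6 };
    assert_eq!(len, 4); // logical-size key + blocks 0, 1, 2

    // Case 2: both ends are data blocks in the same relation: plain subtraction.
    let (start_field6, end_field6): (u32, u32) = (100, 180);
    let len = if start_field6 == 0xffffffff { end_field6 + 1 } else { end_field6 - start_field6 };
    assert_eq!(len, 80);
}
```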
impl<'a> ShardedRange<'a> {
    pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
        Self {
            shard_identity,
            range,
        }
    }

    /// Break up this range into chunks, each of which has at least one local key in it if the
    /// total range has at least one local key.
    pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
        // Optimization for single-key case (e.g. logical size keys)
        if self.range.end == self.range.start.add(1) {
            return vec![(
                if self.shard_identity.is_key_disposable(&self.range.start) {
                    0
                } else {
                    1
                },
                self.range,
            )];
        }

        if !is_contiguous_range(&self.range) {
            // Ranges that span relations are not fragmented. We only get these ranges as a result
            // of operations that act on existing layers, so we trust that the existing range is
            // reasonably small.
            return vec![(u32::MAX, self.range)];
        }

        let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();

        let mut cursor = self.range.start;
        while cursor < self.range.end {
            let advance_by = self.distance_to_next_boundary(cursor);
            let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);

            // If the previous fragment is undersized, then we seek to consume enough
            // blocks to complete it.
            let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
                Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
                Some(frag) => {
                    // Prev block is complete, want the full number.
                    (
                        target_nblocks,
                        if is_fragment_disposable {
                            // If this current range will be empty (not shard-local data), we will merge into previous
                            Some(frag)
                        } else {
                            None
                        },
                    )
                }
                None => {
                    // First iteration, want the full number
                    (target_nblocks, None)
                }
            };

            let advance_by = if is_fragment_disposable {
                advance_by
            } else {
                std::cmp::min(advance_by, want_blocks)
            };

            let next_cursor = cursor.add(advance_by);

            let this_frag = (
                if is_fragment_disposable {
                    0
                } else {
                    advance_by
                },
                cursor..next_cursor,
            );
            cursor = next_cursor;

            if let Some(last_fragment) = merge_last_fragment {
                // Previous fragment was short or this one is empty, merge into it
                last_fragment.0 += this_frag.0;
                last_fragment.1.end = this_frag.1.end;
            } else {
                fragments.push(this_frag);
            }
        }

        fragments
    }

    /// Estimate the physical pages that are within this range, on this shard. This returns
    /// u32::MAX if the range spans relations: this return value should be interpreted as "large".
    pub fn page_count(&self) -> u32 {
        // Special cases for single keys like logical sizes
        if self.range.end == self.range.start.add(1) {
            return if self.shard_identity.is_key_disposable(&self.range.start) {
                0
            } else {
                1
            };
        }

        // We can only do an authentic calculation of contiguous key ranges
        if !is_contiguous_range(&self.range) {
            return u32::MAX;
        }

        // Special case for single sharded tenants: our logical and physical sizes are the same
        if self.shard_identity.count < ShardCount::new(2) {
            return contiguous_range_len(&self.range);
        }

        // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
        // to Self, and add the stripe's block count to our total if so.
        let mut result: u64 = 0;
        let mut cursor = self.range.start;
        while cursor < self.range.end {
            // Count up to the next stripe_size boundary or end of range
            let advance_by = self.distance_to_next_boundary(cursor);

            // If this blocks in this stripe belong to us, add them to our count
            if !self.shard_identity.is_key_disposable(&cursor) {
                result += advance_by as u64;
            }

            cursor = cursor.add(advance_by);
        }

        if result > u32::MAX as u64 {
            u32::MAX
        } else {
            result as u32
        }
    }

    /// Advance the cursor to the next potential fragment boundary: this is either
    /// a stripe boundary, or the end of the range.
    fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
        let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));

        if self.shard_identity.count < ShardCount::new(2) {
            // Optimization: don't bother stepping through stripes if the tenant isn't sharded.
            return distance_to_range_end;
        }

        if cursor.field6 == 0xffffffff {
            // We are wrapping from one relation's logical size to the next relation's first data block
            return 1;
        }

        let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
        let stripe_remainder = self.shard_identity.stripe_size.0
            - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);

        if cfg!(debug_assertions) {
            // We should never overflow field5 and field6 -- our callers check this earlier
            // and would have returned their u32::MAX cases if the input range violated this.
            let next_cursor = cursor.add(stripe_remainder);
            debug_assert!(
                next_cursor.field1 == cursor.field1
                    && next_cursor.field2 == cursor.field2
                    && next_cursor.field3 == cursor.field3
                    && next_cursor.field4 == cursor.field4
                    && next_cursor.field5 == cursor.field5
            )
        }

        std::cmp::min(stripe_remainder, distance_to_range_end)
    }
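A worked instance of the stripe arithmetic in `distance_to_next_boundary`, assuming a stripe size of 32768 blocks and a cursor at block 40000 (7232 blocks into its stripe); the numbers are illustrative only:

```rust
// Standalone arithmetic only; stripe_size and the cursor block are assumed.
fn main() {
    let stripe_size: u32 = 32768;
    let cursor_field6: u32 = 40000;
    let stripe_index = cursor_field6 / stripe_size; // 1: inside the second stripe
    let stripe_remainder = stripe_size - (cursor_field6 - stripe_index * stripe_size);
    // 40000 - 32768 = 7232 blocks consumed, so 25536 remain to the boundary.
    assert_eq!(stripe_remainder, 25536);
}
```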
/// Whereas `page_count` estimates the number of pages physically in this range on this shard,
|
||||
/// this function simply calculates the number of pages in the space, without accounting for those
|
||||
/// pages that would not actually be stored on this node.
|
||||
///
|
||||
/// Don't use this function in code that works with physical entities like layer files.
|
||||
fn raw_size(range: &Range<Key>) -> u32 {
|
||||
if is_contiguous_range(range) {
|
||||
contiguous_range_len(range)
|
||||
} else {
|
||||
u32::MAX
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeySpace {
|
||||
/// Create a key space with a single range.
|
||||
pub fn single(key_range: Range<Key>) -> Self {
|
||||
@@ -25,39 +260,36 @@ impl KeySpace {
|
||||
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
|
||||
/// in each partition.
|
||||
///
|
||||
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
|
||||
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
|
||||
// Assume that each value is 8k in size.
|
||||
let target_nblocks = (target_size / BLCKSZ as u64) as usize;
|
||||
let target_nblocks = (target_size / BLCKSZ as u64) as u32;
|
||||
|
||||
let mut parts = Vec::new();
|
||||
let mut current_part = Vec::new();
|
||||
let mut current_part_size: usize = 0;
|
||||
for range in &self.ranges {
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, start a new partition.
|
||||
let this_size = key_range_size(range) as usize;
|
||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
// While doing partitioning, wrap the range in ShardedRange so that our size calculations
|
||||
// will respect shard striping rather than assuming all keys within a range are present.
|
||||
let range = ShardedRange::new(range.clone(), shard_identity);
|
||||
|
||||
// If the next range is larger than 'target_size', split it into
|
||||
// 'target_size' chunks.
|
||||
let mut remain_size = this_size;
|
||||
let mut start = range.start;
|
||||
while remain_size > target_nblocks {
|
||||
let next = start.add(target_nblocks as u32);
|
||||
parts.push(KeySpace {
|
||||
ranges: vec![start..next],
|
||||
});
|
||||
start = next;
|
||||
remain_size -= target_nblocks
|
||||
// Chunk up the range into parts that each contain up to target_size local blocks
|
||||
for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, and our current partition
|
||||
// covers at least one block that is physically present in this shard,
|
||||
// then start a new partition
|
||||
if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
|
||||
&& current_part_size > 0
|
||||
{
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
current_part.push(frag_range.start..frag_range.end);
|
||||
current_part_size += frag_on_shard_size as usize;
|
||||
}
|
||||
current_part.push(start..range.end);
|
||||
current_part_size += remain_size;
|
||||
}
|
||||
|
||||
// add last partition that wasn't full yet.
|
||||
@@ -71,7 +303,7 @@ impl KeySpace {
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.total_size() == 0
|
||||
self.total_raw_size() == 0
|
||||
}
|
||||
|
||||
/// Merge another keyspace into the current one.
|
||||
@@ -164,11 +396,11 @@ impl KeySpace {
|
||||
self.ranges.last().map(|range| range.end)
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub fn total_size(&self) -> usize {
|
||||
/// The size of the keyspace in pages, before accounting for sharding
|
||||
pub fn total_raw_size(&self) -> usize {
|
||||
self.ranges
|
||||
.iter()
|
||||
.map(|range| key_range_size(range) as usize)
|
||||
.map(|range| ShardedRange::raw_size(range) as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
@@ -207,10 +439,33 @@ pub struct KeyPartitioning {
|
||||
pub parts: Vec<KeySpace>,
|
||||
}
|
||||
|
||||
/// Represents a partitioning of the sparse key space.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct SparseKeyPartitioning {
|
||||
pub parts: Vec<SparseKeySpace>,
|
||||
}
|
||||
|
||||
impl KeyPartitioning {
|
||||
pub fn new() -> Self {
|
||||
KeyPartitioning { parts: Vec::new() }
|
||||
}
|
||||
|
||||
/// Convert a key partitioning to a sparse partition.
|
||||
pub fn into_sparse(self) -> SparseKeyPartitioning {
|
||||
SparseKeyPartitioning {
|
||||
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SparseKeyPartitioning {
|
||||
/// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
|
||||
/// cause long/dead loops.
|
||||
pub fn into_dense(self) -> KeyPartitioning {
|
||||
KeyPartitioning {
|
||||
parts: self.parts.into_iter().map(|x| x.0).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -242,7 +497,7 @@ impl KeySpaceAccum {
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
self.size += key_range_size(&range) as u64;
|
||||
self.size += ShardedRange::raw_size(&range) as u64;
|
||||
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
@@ -274,7 +529,9 @@ impl KeySpaceAccum {
|
||||
std::mem::take(self).to_keyspace()
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
// The total number of keys in this object, ignoring any sharding effects that might cause some of
|
||||
// the keys to be omitted in storage on this shard.
|
||||
pub fn raw_size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
@@ -330,36 +587,19 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
let start = key_range.start;
|
||||
let end = key_range.end;
|
||||
|
||||
if end.field1 != start.field1
|
||||
|| end.field2 != start.field2
|
||||
|| end.field3 != start.field3
|
||||
|| end.field4 != start.field4
|
||||
{
|
||||
return u32::MAX;
|
||||
}
|
||||
|
||||
let start = (start.field5 as u64) << 32 | start.field6 as u64;
|
||||
let end = (end.field5 as u64) << 32 | end.field6 as u64;
|
||||
|
||||
let diff = end - start;
|
||||
if diff > u32::MAX as u64 {
|
||||
u32::MAX
|
||||
} else {
|
||||
diff as u32
|
||||
}
|
||||
}
|
||||
|
||||
pub fn singleton_range(key: Key) -> Range<Key> {
|
||||
key..key.next()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use crate::{
|
||||
models::ShardParameters,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use std::fmt::Write;
|
||||
|
||||
@@ -402,14 +642,17 @@ mod tests {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
|
||||
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
|
||||
assert_eq!(accum.size(), expected_size);
|
||||
let expected_size: u64 = ranges
|
||||
.iter()
|
||||
.map(|r| ShardedRange::raw_size(r) as u64)
|
||||
.sum();
|
||||
assert_eq!(accum.raw_size(), expected_size);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
||||
assert_eq!(accum.size(), 0);
|
||||
assert_eq!(accum.raw_size(), 0);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
||||
assert_eq!(accum.size(), 0);
|
||||
assert_eq!(accum.raw_size(), 0);
|
||||
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
@@ -706,4 +949,412 @@ mod tests {
|
||||
]
|
||||
);
|
||||
}
|
||||
#[test]
|
||||
fn sharded_range_relation_gap() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
|
||||
end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
|
||||
// Key range spans relations, expect MAX
|
||||
assert_eq!(range.page_count(), u32::MAX);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_single_key() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
|
||||
end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
// Single-key range on logical size key
|
||||
assert_eq!(range.page_count(), 1);
|
||||
}
|
||||
|
||||
/// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
|
||||
#[test]
|
||||
fn contiguous_range_check() {
|
||||
assert!(!is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
|
||||
),);
|
||||
|
||||
// The ranges goes all the way up to the 0xffffffff, including it: this is
|
||||
// not considered a rel block range because 0xffffffff stores logical sizes,
|
||||
// not blocks.
|
||||
assert!(!is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
|
||||
),);
|
||||
|
||||
// Keys within the normal data region of a relation
|
||||
assert!(is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
|
||||
),);
|
||||
|
||||
// The logical size key of one forkno, then some blocks in the next
|
||||
assert!(is_contiguous_range(
|
||||
&(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
|
||||
..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
|
||||
),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_forkno_gap() {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
|
||||
end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
|
||||
// Range spanning the end of one forkno and the start of the next: we do not attempt to
|
||||
// calculate a valid size, because we have no way to know if they keys between start
|
||||
// and end are actually in use.
|
||||
assert_eq!(range.page_count(), u32::MAX);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_identity_keyspaces_one_relation() {
|
||||
for shard_number in 0..4 {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
|
||||
end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
|
||||
},
|
||||
&shard_identity,
|
||||
);
|
||||
|
||||
// Very simple case: range covering block zero of one relation, where that block maps to shard zero
|
||||
if shard_number == 0 {
|
||||
assert_eq!(range.page_count(), 1);
|
||||
} else {
|
||||
// Other shards should perceive the range's size as zero
|
||||
assert_eq!(range.page_count(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test helper: construct a ShardedRange and call fragment() on it, returning
|
||||
/// the total page count in the range and the fragments.
|
||||
fn do_fragment(
|
||||
range_start: Key,
|
||||
range_end: Key,
|
||||
shard_identity: &ShardIdentity,
|
||||
target_nblocks: u32,
|
||||
) -> (u32, Vec<(u32, Range<Key>)>) {
|
||||
let range = ShardedRange::new(
|
||||
Range {
|
||||
start: range_start,
|
||||
end: range_end,
|
||||
},
|
||||
shard_identity,
|
||||
);
|
||||
|
||||
let page_count = range.page_count();
|
||||
let fragments = range.fragment(target_nblocks);
|
||||
|
||||
// Invariant: we always get at least one fragment
|
||||
assert!(!fragments.is_empty());
|
||||
|
||||
// Invariant: the first/last fragment start/end should equal the input start/end
|
||||
assert_eq!(fragments.first().unwrap().1.start, range_start);
|
||||
assert_eq!(fragments.last().unwrap().1.end, range_end);
|
||||
|
||||
if page_count > 0 {
|
||||
// Invariant: every fragment must contain at least one shard-local page, if the
|
||||
// total range contains at least one shard-local page
|
||||
let all_nonzero = fragments.iter().all(|f| f.0 > 0);
|
||||
if !all_nonzero {
|
||||
eprintln!("Found a zero-length fragment: {:?}", fragments);
|
||||
}
|
||||
assert!(all_nonzero);
|
||||
} else {
|
||||
// A range with no shard-local pages should always be returned as a single fragment
|
||||
assert_eq!(fragments, vec![(0, range_start..range_end)]);
|
||||
}
|
||||
|
||||
// Invariant: fragments must be ordered and non-overlapping
|
||||
let mut last: Option<Range<Key>> = None;
|
||||
for frag in &fragments {
|
||||
if let Some(last) = last {
|
||||
assert!(frag.1.start >= last.end);
|
||||
assert!(frag.1.start > last.start);
|
||||
}
|
||||
last = Some(frag.1.clone())
|
||||
}
|
||||
|
||||
        // Invariant: fragments respect target_nblocks
        for frag in &fragments {
            assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
        }

        (page_count, fragments)
    }

    /// Really simple tests for fragment(), on a range that just contains a single stripe
    /// for a single tenant.
    #[test]
    fn sharded_range_fragment_simple() {
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();

        // A range which we happen to know covers exactly one stripe which belongs to this shard
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();

        // Ask for stripe_size blocks, we get the whole stripe
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 32768),
            (32768, vec![(32768, input_start..input_end)])
        );

        // Ask for more, we still get the whole stripe
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 10000000),
            (32768, vec![(32768, input_start..input_end)])
        );

        // Ask for target_nblocks of half the stripe size, we get two halves
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 16384),
            (
                32768,
                vec![
                    (16384, input_start..input_start.add(16384)),
                    (16384, input_start.add(16384)..input_end)
                ]
            )
        );
    }

    #[test]
    fn sharded_range_fragment_multi_stripe() {
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();

        // A range which covers multiple stripes, exactly one of which belongs to the current shard.
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
        // Ask for all the blocks, get a fragment that covers the whole range but reports
        // its size to be just the blocks belonging to our shard.
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 131072),
            (32768, vec![(32768, input_start..input_end)])
        );

        // Ask for a sub-stripe quantity
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 16000),
            (
                32768,
                vec![
                    (16000, input_start..input_start.add(16000)),
                    (16000, input_start.add(16000)..input_start.add(32000)),
                    (768, input_start.add(32000)..input_end),
                ]
            )
        );

        // Try on a range that starts slightly after our owned stripe
        assert_eq!(
            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
            (32767, vec![(32767, input_start.add(1)..input_end)])
        );
    }

    /// Test our calculations work correctly when we start a range from the logical size key of
    /// a previous relation.
    #[test]
    fn sharded_range_fragment_starting_from_logical_size() {
        let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();

        // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x10000),
            (0x8001, vec![(0x8001, input_start..input_end)])
        );

        // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
        // store all logical sizes)
        let shard_identity = ShardIdentity::new(
            ShardNumber(1),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x10000),
            (0x1, vec![(0x1, input_start..input_end)])
        );
    }

    /// Test that ShardedRange behaves properly when used on un-sharded data
    #[test]
    fn sharded_range_fragment_unsharded() {
        let shard_identity = ShardIdentity::unsharded();

        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x8000),
            (
                0x10000,
                vec![
                    (0x8000, input_start..input_start.add(0x8000)),
                    (0x8000, input_start.add(0x8000)..input_start.add(0x10000))
                ]
            )
        );
    }

    #[test]
    fn sharded_range_fragment_cross_relation() {
        let shard_identity = ShardIdentity::unsharded();

        // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x8000),
            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
        );

        // Same, but using a sharded identity
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x8000),
            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
        );
    }

    #[test]
    fn sharded_range_fragment_tiny_nblocks() {
        let shard_identity = ShardIdentity::unsharded();

        // A tiny relation range of 0x38 blocks: fragments must still respect target_nblocks
        let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
        let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 16),
            (
                0x38,
                vec![
                    (16, input_start..input_start.add(16)),
                    (16, input_start.add(16)..input_start.add(32)),
                    (16, input_start.add(32)..input_start.add(48)),
                    (8, input_start.add(48)..input_end),
                ]
            )
        );
    }

    #[test]
    fn sharded_range_fragment_fuzz() {
        // Use a fixed seed: we don't want to explicitly pick values, but we do want
        // the test to be reproducible.
        let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);

        for _i in 0..1000 {
            let shard_identity = if prng.next_u32() % 2 == 0 {
                ShardIdentity::unsharded()
            } else {
                let shard_count = prng.next_u32() % 127 + 1;
                ShardIdentity::new(
                    ShardNumber((prng.next_u32() % shard_count) as u8),
                    ShardCount::new(shard_count as u8),
                    ShardParameters::DEFAULT_STRIPE_SIZE,
                )
                .unwrap()
            };

            let target_nblocks = prng.next_u32() % 65536 + 1;

            let start_offset = prng.next_u32() % 16384;

            // Try ranges up to 8192 blocks in size, that are always at least 1
            let range_size = prng.next_u32() % 8192 + 1;

            // Build a random range within a single relation
            let input_start = Key::from_hex("000000067F00000001000004E10000000000")
                .unwrap()
                .add(start_offset);
            let input_end = input_start.add(range_size);

            // This test's main success conditions are the invariants baked into do_fragment
            let (_total_size, fragments) =
                do_fragment(input_start, input_end, &shard_identity, target_nblocks);

            // Pick a random key within the range and check it appears in the output
            let example_key = input_start.add(prng.next_u32() % range_size);

            // Panic on unwrap if it isn't found
            let example_key_frag = fragments
                .iter()
                .find(|f| f.1.contains(&example_key))
                .unwrap();

            // Check that the fragment containing our random key has a nonzero size if
            // that key is shard-local
            let example_key_local = !shard_identity.is_key_disposable(&example_key);
            if example_key_local {
                assert!(example_key_frag.0 > 0);
            }
        }
    }
}

@@ -1,9 +1,11 @@
use utils::lsn::Lsn;

use crate::keyspace::SparseKeySpace;

#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,

    pub sparse_keys: crate::keyspace::SparseKeySpace,
    pub at_lsn: Lsn,
}

@@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
        map.serialize_key("sparse_keys")?;
        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
}
@@ -133,6 +139,12 @@ mod tests {
                "030000000000000000000000000000000003"
            ]
        ],
        "sparse_keys": [
            [
                "620000000000000000000000000000000000",
                "620000000000000000000000000000000003"
            ]
        ],
        "at_lsn": "0/2240160"
    }
    "#;

@@ -451,7 +451,7 @@ impl ShardIdentity {
    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
    /// tenants. Modern single-shard tenants should not use this: they should
    /// have number=0 count=1.
    pub fn unsharded() -> Self {
    pub const fn unsharded() -> Self {
        Self {
            number: ShardNumber(0),
            count: ShardCount(0),
@@ -538,24 +538,6 @@ impl ShardIdentity {
        }
    }

    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
    ///
    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
    /// as a symptom of that issue.
    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
            return false;
        }

        let mut hash = murmurhash32(key.field4);
        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);

        // The key may be affected by issue #7454: it is an initfork and it would not
        // have mapped to shard 0 until we fixed that issue.
        mapped_shard != ShardNumber(0)
    }

    /// Return true if the key should be discarded if found in this shard's
    /// data store, e.g. during compaction after a split.
    ///

@@ -2,11 +2,10 @@

use std::cmp::{Eq, Ordering};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
use std::sync::Mutex;
use std::time::Duration;
use tokio::sync::watch::{channel, Receiver, Sender};
use tokio::sync::watch::{self, channel};
use tokio::time::timeout;

/// An error happened while waiting for a number
@@ -35,23 +34,73 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
}

/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
/// Heap of waiters, lowest numbers pop first.
struct Waiters<V>
where
    S: MonotonicCounter<V>,
    V: Ord,
{
    waiters: BinaryHeap<Waiter<V>>,
    current: S,
    shutdown: bool,
    heap: BinaryHeap<Waiter<V>>,
    /// Number of the first waiter in the heap, or None if there are no waiters.
    status_channel: watch::Sender<Option<V>>,
}

impl<V> Waiters<V>
where
    V: Ord + Copy,
{
    fn new() -> Self {
        Waiters {
            heap: BinaryHeap::new(),
            status_channel: channel(None).0,
        }
    }

    /// `status_channel` contains the number of the first waiter in the heap.
    /// This function should be called whenever the waiters heap changes.
    fn update_status(&self) {
        let first_waiter = self.heap.peek().map(|w| w.wake_num);
        let _ = self.status_channel.send_replace(first_waiter);
    }

    /// Add a new waiter to the heap, return a channel that will be notified when the number arrives.
    fn add(&mut self, num: V) -> watch::Receiver<()> {
        let (tx, rx) = channel(());
        self.heap.push(Waiter {
            wake_num: num,
            wake_channel: tx,
        });
        self.update_status();
        rx
    }

    /// Pop all waiters <= num from the heap. Collect channels in a vector,
    /// so that the caller can wake them up.
    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
        let mut wake_these = Vec::new();
        while let Some(n) = self.heap.peek() {
            if n.wake_num > num {
                break;
            }
            wake_these.push(self.heap.pop().unwrap().wake_channel);
        }
        self.update_status();
        wake_these
    }

    /// Used on shutdown to efficiently drop all waiters.
    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
        let heap = mem::take(&mut self.heap);
        self.update_status();
        heap
    }
}

struct Waiter<T>
where
    T: Ord,
{
    wake_num: T,              // wake me when this number arrives ...
    wake_channel: Sender<()>, // ... by sending a message to this channel
    wake_num: T,                     // wake me when this number arrives ...
    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
}

// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -76,6 +125,17 @@ impl<T: Ord> PartialEq for Waiter<T> {

impl<T: Ord> Eq for Waiter<T> {}

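The comment above refers to an `Ord` implementation that this page elides. As a rough sketch of its likely shape (an assumption, not copied from the diff), reversing the comparison is what turns `BinaryHeap`'s max-heap behavior into the min-heap that `pop_leq` relies on:

impl<T: Ord> Ord for Waiter<T> {
    fn cmp(&self, other: &Self) -> Ordering {
        // Compare in reverse order, so the heap's "greatest" element is the
        // waiter with the smallest wake_num, and it pops first.
        other.wake_num.cmp(&self.wake_num)
    }
}

impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
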
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
    S: MonotonicCounter<V>,
    V: Ord,
{
    waiters: Waiters<V>,
    current: S,
    shutdown: bool,
}

/// A tool for waiting on a sequence number
///
/// This provides a way to wait for the arrival of a number.
@@ -108,7 +168,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
            waiters: BinaryHeap::new(),
            waiters: Waiters::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -128,9 +188,8 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;

            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
            // Take all waiters to drop them later.
            internal.waiters.take_all()

            // Drop the lock as we exit this scope.
        };
@@ -196,7 +255,7 @@ where

    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -205,12 +264,8 @@ where
            return Err(SeqWaitError::Shutdown);
        }

        // Create a new channel.
        let (tx, rx) = channel(());
        internal.waiters.push(Waiter {
            wake_num: num,
            wake_channel: tx,
        });
        // Add waiter channel to the queue.
        let rx = internal.waiters.add(num);
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -231,16 +286,8 @@ where
            }
            internal.current.cnt_advance(num);

            // Pop all waiters <= num from the heap. Collect them in a vector, and
            // wake them up after releasing the lock.
            let mut wake_these = Vec::new();
            while let Some(n) = internal.waiters.peek() {
                if n.wake_num > num {
                    break;
                }
                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
            }
            wake_these
            // Pop all waiters <= num from the heap.
            internal.waiters.pop_leq(num)
        };

        for tx in wake_these {
@@ -255,6 +302,23 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }

    /// Get a Receiver for the current status.
    ///
    /// The current status is the number of the first waiter in the queue,
    /// or None if there are no waiters.
    ///
    /// This receiver will be notified whenever the status changes.
    /// It is useful for receiving notifications when the first waiter
    /// starts waiting for a number, or when there are no more waiters left.
    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
        self.internal
            .lock()
            .unwrap()
            .waiters
            .status_channel
            .subscribe()
    }
}

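As a usage sketch (illustrative only; `seq` is a hypothetical `SeqWait` over `u64` numbers and is not defined in this diff), a caller could observe the head of the waiter queue like this:

let mut status = seq.status_receiver();
tokio::spawn(async move {
    // changed() resolves each time the first waiter changes.
    while status.changed().await.is_ok() {
        match *status.borrow() {
            Some(num) => println!("oldest waiter is waiting for {num}"),
            None => println!("no waiters"),
        }
    }
});

Because this is a tokio watch channel, a late subscriber sees only the latest status rather than a backlog of events.
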
#[cfg(test)]

@@ -18,6 +18,7 @@
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use tracing::{debug, info};

use std::collections::{HashSet, VecDeque};
@@ -125,6 +126,7 @@ async fn compact_level<E: CompactionJobExecutor>(
    }

    let mut state = LevelCompactionState {
        shard_identity: *executor.get_shard_identity(),
        target_file_size,
        _lsn_range: lsn_range.clone(),
        layers: layer_fragments,
@@ -164,6 +166,8 @@ struct LevelCompactionState<'a, E>
where
    E: CompactionJobExecutor,
{
    shard_identity: ShardIdentity,

    // parameters
    target_file_size: u64,

@@ -366,6 +370,7 @@ where
                .executor
                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                .await?,
            &self.shard_identity,
        ) * 8192;

        let wal_size = job
@@ -430,7 +435,7 @@ where
            keyspace,
            self.target_file_size / 8192,
        );
        while let Some(key_range) = window.choose_next_image() {
        while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
            new_jobs.push(CompactionJob::<E> {
                key_range,
                lsn_range: job.lsn_range.clone(),
@@ -623,7 +628,12 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
    }

    // Advance the cursor until it reaches 'target_keysize'.
    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
    fn advance_until_size(
        &mut self,
        w: &KeyspaceWindowHead<K>,
        max_size: u64,
        shard_identity: &ShardIdentity,
    ) {
        while self.accum_keysize < max_size && !self.reached_end(w) {
            let curr_range = &w.keyspace[self.keyspace_idx];
            if self.end_key < curr_range.start {
@@ -632,7 +642,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
            }

            // We're now within 'curr_range'. Can we advance past it completely?
            let distance = K::key_range_size(&(self.end_key..curr_range.end));
            let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
            if (self.accum_keysize + distance as u64) < max_size {
                // oh yeah, it fits
                self.end_key = curr_range.end;
@@ -641,7 +651,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
            } else {
                // advance within the range
                let skip_key = self.end_key.skip_some();
                let distance = K::key_range_size(&(self.end_key..skip_key));
                let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
                if (self.accum_keysize + distance as u64) < max_size {
                    self.end_key = skip_key;
                    self.accum_keysize += distance as u64;
@@ -677,7 +687,7 @@ where
        }
    }

    fn choose_next_image(&mut self) -> Option<Range<K>> {
    fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
            // we've reached the end
            return None;
@@ -687,6 +697,7 @@ where
        next_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + self.head.target_keysize,
            shard_identity,
        );

        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
@@ -695,6 +706,7 @@ where
        end_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
            shard_identity,
        );
        if end_pos.reached_end(&self.head) {
            // gobble up any unused keyspace between the last used key and end of the range

@@ -5,6 +5,7 @@ use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pageserver_api::shard::ShardIdentity;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
@@ -13,11 +14,17 @@ use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};

pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
pub fn keyspace_total_size<K>(
    keyspace: &CompactionKeySpace<K>,
    shard_identity: &ShardIdentity,
) -> u64
where
    K: CompactionKey,
{
    keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
    keyspace
        .iter()
        .map(|r| K::key_range_size(r, shard_identity) as u64)
        .sum()
}

pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {

@@ -4,7 +4,7 @@
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size};
use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
use std::ops::Range;
use utils::lsn::Lsn;

@@ -32,6 +32,8 @@ pub trait CompactionJobExecutor {
    // Functions that the planner uses to support its decisions
    // ----

    fn get_shard_identity(&self) -> &ShardIdentity;

    /// Return all layers that overlap the given bounding box.
    fn get_layers(
        &mut self,
@@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
    ///
    /// This returns u32, for compatibility with Repository::key. If the
    /// distance is larger, return u32::MAX.
    fn key_range_size(key_range: &Range<Self>) -> u32;
    fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;

    // return "self + 1"
    fn next(&self) -> Self;
@@ -113,8 +115,8 @@ impl CompactionKey for Key {
    const MIN: Self = Self::MIN;
    const MAX: Self = Self::MAX;

    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
        key_range_size(r)
    fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
        ShardedRange::new(r.clone(), shard_identity).page_count()
    }
    fn next(&self) -> Key {
        (self as &Key).next()

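A hedged illustration of what the shard-aware signature buys (the `demo` wrapper is an assumption for the sketch; the key constants reuse values from the fragment tests earlier on this page): for a sharded tenant, the range size now counts only the pages the shard owns, so size-based planning stops overestimating after a split.

use pageserver_api::key::Key;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardParameters};

fn demo() {
    // Four stripes' worth of blocks; shard 0 of 4 owns exactly one stripe.
    let start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
    let end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();

    let shard0 = ShardIdentity::new(
        ShardNumber(0),
        ShardCount::new(4),
        ShardParameters::DEFAULT_STRIPE_SIZE,
    )
    .unwrap();

    // 0x20000 raw pages in the range, but only 0x8000 are local to shard 0.
    let raw = ShardedRange::new(start..end, &ShardIdentity::unsharded()).page_count();
    let local = ShardedRange::new(start..end, &shard0).page_count();
    assert!(local < raw);
}
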
@@ -3,6 +3,7 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};

use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use rand::Rng;
use tracing::info;

@@ -71,7 +72,7 @@ impl interface::CompactionKey for Key {
    const MIN: Self = u64::MIN;
    const MAX: Self = u64::MAX;

    fn key_range_size(key_range: &Range<Self>) -> u32 {
    fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
        std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
    }

@@ -434,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline {
    type ImageLayer = Arc<MockImageLayer>;
    type RequestContext = MockRequestContext;

    fn get_shard_identity(&self) -> &ShardIdentity {
        static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
        &IDENTITY
    }

    async fn get_layers(
        &mut self,
        key_range: &Range<Self::Key>,

@@ -10,10 +10,10 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, bail, ensure, Context};
use anyhow::{anyhow, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
use pageserver_api::key::{key_to_slru_block, Key};
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
@@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;

#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#}")]
    Client(#[source] io::Error),
}

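A small sketch of what the two-variant split enables downstream (mirrored later in this diff by `map_basebackup_error` in the page service): callers can classify client-side I/O failures as plain disconnects instead of pageserver faults.

// Illustrative helper, not part of the diff.
fn is_client_disconnect(err: &BasebackupError) -> bool {
    matches!(err, BasebackupError::Client(_))
}
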
/// Create basebackup with non-rel data in it.
/// Only include relational data if 'full_backup' is true.
///
@@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    ctx: &'a RequestContext,
) -> anyhow::Result<()>
) -> Result<(), BasebackupError>
where
    W: AsyncWrite + Send + Sync + Unpin,
{
@@ -92,8 +100,10 @@ where

    // Consolidate the derived and the provided prev_lsn values
    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
        if backup_prev != Lsn(0) {
            ensure!(backup_prev == provided_prev_lsn);
        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
            return Err(BasebackupError::Server(anyhow!(
                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
            )));
        }
        provided_prev_lsn
    } else {
@@ -159,15 +169,26 @@ where
        }
    }

    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
        let (kind, segno, _) = key_to_slru_block(*key)?;

        match kind {
            SlruKind::Clog => {
                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
                if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
                    return Err(BasebackupError::Server(anyhow!(
                        "invalid SlruKind::Clog record: block.len()={}",
                        block.len()
                    )));
                }
            }
            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
                ensure!(block.len() == BLCKSZ as usize);
                if block.len() != BLCKSZ as usize {
                    return Err(BasebackupError::Server(anyhow!(
                        "invalid {:?} record: block.len()={}",
                        kind,
                        block.len()
                    )));
                }
            }
        }

@@ -194,12 +215,15 @@ where
        Ok(())
    }

    async fn flush(&mut self) -> anyhow::Result<()> {
    async fn flush(&mut self) -> Result<(), BasebackupError> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
        self.ar.append(&header, self.buf.as_slice()).await?;
        self.ar
            .append(&header, self.buf.as_slice())
            .await
            .map_err(BasebackupError::Client)?;

        self.total_blocks += nblocks;
        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -209,7 +233,7 @@ where
        Ok(())
    }

    async fn finish(mut self) -> anyhow::Result<()> {
    async fn finish(mut self) -> Result<(), BasebackupError> {
        let res = if self.current_segment.is_none() || self.buf.is_empty() {
            Ok(())
        } else {
@@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W>
where
    W: AsyncWrite + Send + Sync + Unpin,
{
    async fn send_tarball(mut self) -> anyhow::Result<()> {
    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
        // TODO include checksum

        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -262,16 +286,25 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
                .await?
                .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
                .await
                .map_err(|e| BasebackupError::Server(e.into()))?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
                );

            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
                let blocks = self
                    .timeline
                    .get_vectored(part, self.lsn, self.ctx)
                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?;

                for (key, block) in blocks {
                    slru_builder.add_block(&key, block?).await?;
                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
                    slru_builder.add_block(&key, block).await?;
                }
            }
            slru_builder.finish().await?;
@@ -279,8 +312,11 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        for ((spcnode, dbnode), has_relmap_file) in self
            .timeline
            .list_dbdirs(self.lsn, self.ctx)
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -289,7 +325,8 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                .await?;
                .await
                .map_err(|e| BasebackupError::Server(e.into()))?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -297,20 +334,7 @@ where
                if rel.forknum == INIT_FORKNUM {
                    // I doubt we need _init fork itself, but having it at least
                    // serves as a marker that the relation is unlogged.
                    if let Err(_e) = self.add_rel(rel, rel).await {
                        if self
                            .timeline
                            .get_shard_identity()
                            .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
                        {
                            // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
                            // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
                            // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
                            // recreate.
                            tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
                            continue;
                        }
                    };
                    self.add_rel(rel, rel).await?;
                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                    continue;
                }
@@ -325,7 +349,12 @@ where
            }
        }

        for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
        for (path, content) in self
            .timeline
            .list_aux_files(self.lsn, self.ctx)
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            if path.starts_with("pg_replslot") {
                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                let restart_lsn = Lsn(u64::from_le_bytes(
@@ -356,34 +385,41 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
            .await?
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            self.add_twophase_file(xid).await?;
        }

        fail_point!("basebackup-before-control-file", |_| {
            bail!("failpoint basebackup-before-control-file")
            Err(BasebackupError::Server(anyhow!(
                "failpoint basebackup-before-control-file"
            )))
        });

        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file().await?;
        self.ar.finish().await?;
        self.ar.finish().await.map_err(BasebackupError::Client)?;
        debug!("all tarred up!");
        Ok(())
    }

    /// Add contents of relfilenode `src`, naming it as `dst`.
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
        let nblocks = self
            .timeline
            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
            .await?;
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
            let file_name = dst.to_segfile_name(0);
            let header = new_tar_header(&file_name, 0)?;
            self.ar.append(&header, &mut io::empty()).await?;
            self.ar
                .append(&header, &mut io::empty())
                .await
                .map_err(BasebackupError::Client)?;
            return Ok(());
        }

@@ -398,13 +434,17 @@ where
            let img = self
                .timeline
                .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
                .await?;
                .await
                .map_err(|e| BasebackupError::Server(e.into()))?;
            segment_data.extend_from_slice(&img[..]);
        }

        let file_name = dst.to_segfile_name(seg as u32);
        let header = new_tar_header(&file_name, segment_data.len() as u64)?;
        self.ar.append(&header, segment_data.as_slice()).await?;
        self.ar
            .append(&header, segment_data.as_slice())
            .await
            .map_err(BasebackupError::Client)?;

        seg += 1;
        startblk = endblk;
@@ -424,20 +464,22 @@ where
        spcnode: u32,
        dbnode: u32,
        has_relmap_file: bool,
    ) -> anyhow::Result<()> {
    ) -> Result<(), BasebackupError> {
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                .await?;
                .await
                .map_err(|e| BasebackupError::Server(e.into()))?;

            ensure!(
                img.len()
                    == dispatch_pgversion!(
                        self.timeline.pg_version,
                        pgv::bindings::SIZEOF_RELMAPFILE
                    )
            );
            if img.len()
                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
            {
                return Err(BasebackupError::Server(anyhow!(
                    "img.len() != SIZEOF_RELMAPFILE, img.len()={}",
                    img.len(),
                )));
            }

            Some(img)
        } else {
@@ -450,14 +492,20 @@ where
                ver => format!("{ver}\x0A"),
            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
            self.ar.append(&header, pg_version_str.as_bytes()).await?;
            self.ar
                .append(&header, pg_version_str.as_bytes())
                .await
                .map_err(BasebackupError::Client)?;

            info!("timeline.pg_version {}", self.timeline.pg_version);

            if let Some(img) = relmap_img {
                // filenode map for global tablespace
                let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
                self.ar.append(&header, &img[..]).await?;
                self.ar
                    .append(&header, &img[..])
                    .await
                    .map_err(BasebackupError::Client)?;
            } else {
                warn!("global/pg_filenode.map is missing");
            }
@@ -476,18 +524,26 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                    .await?
                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?
                    .is_empty()
            {
                return Ok(());
            }
            // User defined tablespaces are not supported
            ensure!(spcnode == DEFAULTTABLESPACE_OID);
            if spcnode != DEFAULTTABLESPACE_OID {
                return Err(BasebackupError::Server(anyhow!(
                    "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
                )));
            }

            // Append dir path for each database
            let path = format!("base/{}", dbnode);
            let header = new_tar_header_dir(&path)?;
            self.ar.append(&header, &mut io::empty()).await?;
            self.ar
                .append(&header, &mut io::empty())
                .await
                .map_err(BasebackupError::Client)?;

            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -497,11 +553,17 @@ where
                    ver => format!("{ver}\x0A"),
                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
                self.ar.append(&header, pg_version_str.as_bytes()).await?;
                self.ar
                    .append(&header, pg_version_str.as_bytes())
                    .await
                    .map_err(BasebackupError::Client)?;

                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                let header = new_tar_header(&relmap_path, img.len() as u64)?;
                self.ar.append(&header, &img[..]).await?;
                self.ar
                    .append(&header, &img[..])
                    .await
                    .map_err(BasebackupError::Client)?;
            }
        };
        Ok(())
@@ -510,11 +572,12 @@ where
    //
    // Extract twophase state files
    //
    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
            .await?;
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -522,7 +585,10 @@ where
        buf.put_u32_le(crc);
        let path = format!("pg_twophase/{:>08X}", xid);
        let header = new_tar_header(&path, buf.len() as u64)?;
        self.ar.append(&header, &buf[..]).await?;
        self.ar
            .append(&header, &buf[..])
            .await
            .map_err(BasebackupError::Client)?;

        Ok(())
    }
@@ -531,24 +597,28 @@ where
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
            if self.lsn == self.timeline.get_ancestor_lsn() {
                write!(zenith_signal, "PREV LSN: none")?;
                write!(zenith_signal, "PREV LSN: none")
                    .map_err(|e| BasebackupError::Server(e.into()))?;
            } else {
                write!(zenith_signal, "PREV LSN: invalid")?;
                write!(zenith_signal, "PREV LSN: invalid")
                    .map_err(|e| BasebackupError::Server(e.into()))?;
            }
        } else {
            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
                .map_err(|e| BasebackupError::Server(e.into()))?;
        }
        self.ar
            .append(
                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
                zenith_signal.as_bytes(),
            )
            .await?;
            .await
            .map_err(BasebackupError::Client)?;

        let checkpoint_bytes = self
            .timeline
@@ -570,7 +640,10 @@ where

        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
        self.ar.append(&header, &pg_control_bytes[..]).await?;
        self.ar
            .append(&header, &pg_control_bytes[..])
            .await
            .map_err(BasebackupError::Client)?;

        //send wal segment
        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -585,8 +658,16 @@ where
            self.lsn,
        )
        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..]).await?;
        if wal_seg.len() != WAL_SEGMENT_SIZE {
            return Err(BasebackupError::Server(anyhow!(
                "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
                wal_seg.len()
            )));
        }
        self.ar
            .append(&header, &wal_seg[..])
            .await
            .map_err(BasebackupError::Client)?;
        Ok(())
    }
}

@@ -1918,12 +1918,14 @@ async fn timeline_collect_keyspace(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
    let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
    let keys = timeline
    let (dense_ks, sparse_ks) = timeline
        .collect_keyspace(at_lsn, &ctx)
        .await
        .map_err(|e| ApiError::InternalServerError(e.into()))?;

    let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
    // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
    // Therefore, we split dense/sparse keys in this API.
    let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };

    json_response(StatusCode::OK, res)
}

@@ -48,6 +48,7 @@ use utils::{

use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
@@ -1236,6 +1237,13 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        fn map_basebackup_error(err: BasebackupError) -> QueryError {
            match err {
                BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
                BasebackupError::Server(e) => QueryError::Other(e),
            }
        }

        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -1261,7 +1269,8 @@ impl PageServerHandler {
        let lsn_awaited_after = started.elapsed();

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)
            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
@@ -1276,7 +1285,8 @@ impl PageServerHandler {
                full_backup,
                ctx,
            )
            .await?;
            .await
            .map_err(map_basebackup_error)?;
        } else {
            let mut writer = pgb.copyout_writer();
            if gzip {
@@ -1297,9 +1307,13 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
                .await?;
                .await
                .map_err(map_basebackup_error)?;
                // shutdown the encoder to ensure the gzip footer is written
                encoder.shutdown().await?;
                encoder
                    .shutdown()
                    .await
                    .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
@@ -1309,11 +1323,13 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
                .await?;
                .await
                .map_err(map_basebackup_error)?;
            }
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
        pgb.write_message_noflush(&BeMessage::CopyDone)
            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let basebackup_after = started

@@ -23,6 +23,7 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -730,11 +731,13 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed may be removed from the underlying storage (from
    /// that LSN forwards).
    ///
    /// The return value is (dense keyspace, sparse keyspace).
    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<KeySpace, CollectKeySpaceError> {
    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -806,7 +809,12 @@ impl Timeline {
        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
            result.add_key(AUX_FILES_KEY);
        }
        Ok(result.to_keyspace())

        Ok((
            result.to_keyspace(),
            /* AUX sparse key space */
            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
        ))
    }

    /// Get cached size of relation if it was not updated after the specified LSN

@@ -3873,6 +3873,7 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4512,11 +4513,23 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
        timeline: Arc<Timeline>,
        ctx: &RequestContext,
        lsn: Lsn,
        repeat: usize,
        key_count: usize,
    ) -> anyhow::Result<()> {
        let compact = true;
        bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
    }

    async fn bulk_insert_maybe_compact_gc(
        timeline: Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
        repeat: usize,
        key_count: usize,
        compact: bool,
    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;
@@ -4557,9 +4570,11 @@ mod tests {
            )
            .await?;
            timeline.freeze_and_flush().await?;
            timeline
                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
                .await?;
            if compact {
                timeline
                    .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
                    .await?;
            }
            timeline.gc().await?;
        }

@@ -5042,7 +5057,22 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_random_updates")?;
        let names_algorithms = [
            ("test_random_updates_legacy", CompactionAlgorithm::Legacy),
            ("test_random_updates_tiered", CompactionAlgorithm::Tiered),
        ];
        for (name, algorithm) in names_algorithms {
            test_random_updates_algorithm(name, algorithm).await?;
        }
        Ok(())
    }

    async fn test_random_updates_algorithm(
        name: &'static str,
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
        let mut harness = TenantHarness::create(name)?;
        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5107,7 +5137,7 @@ mod tests {
            );
        }

            // Perform a cycle of flush, compact, and GC
            // Perform a cycle of flush and GC
            let cutoff = tline.get_last_record_lsn();
            tline
                .update_gc_info(
@@ -5119,9 +5149,6 @@ mod tests {
                )
                .await?;
            tline.freeze_and_flush().await?;
            tline
                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
                .await?;
            tline.gc().await?;
        }

@@ -5402,19 +5429,36 @@ mod tests {

    #[tokio::test]
    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_read_at_max_lsn")?;
        let names_algorithms = [
            ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
            ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
        ];
        for (name, algorithm) in names_algorithms {
            test_read_at_max_lsn_algorithm(name, algorithm).await?;
        }
        Ok(())
    }

    async fn test_read_at_max_lsn_algorithm(
        name: &'static str,
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
        let mut harness = TenantHarness::create(name)?;
        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

        let lsn = Lsn(0x10);
        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
        let compact = false;
        bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;

        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let read_lsn = Lsn(u64::MAX - 1);

        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
        let result = tline.get(test_key, read_lsn, &ctx).await;
        assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());

        Ok(())
    }

@@ -916,6 +916,7 @@ mod tests {
        assert_eq!(lhs, rhs);
    }

    #[cfg(test)]
    fn brute_force_range_search(
        layer_map: &LayerMap,
        key_range: Range<Key>,

@@ -2,6 +2,7 @@
//! page server.

use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
@@ -253,17 +254,15 @@ impl TenantsMap {
    }
}

/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
/// the slower actual deletion in the background.
///
/// This is "safe" in that that it won't leave behind a partially deleted directory
|
||||
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
|
||||
/// the contents.
|
||||
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
    let tmp_path = safe_rename_tenant_dir(path).await?;
    fs::remove_dir_all(tmp_path).await
}

async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
    let parent = path
        .as_ref()
@@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
}

/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
    let task_tenant_id = None;

    task_mgr::spawn(
        task_mgr::BACKGROUND_RUNTIME.handle(),
        TaskKind::MgmtRequest,
        task_tenant_id,
        None,
        "tenant_files_delete",
        false,
        async move {
            fs::remove_dir_all(tmp_path.as_path())
                .await
                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
        },
    );
}

static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

@@ -570,7 +591,11 @@ pub async fn init_tenant_mgr(
    );
    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

    // Construct `Tenant` objects and start them running
    // Accumulate futures for writing tenant configs, so that we can execute in parallel
    let mut config_write_futs = Vec::new();

    // Update the location configs according to the re-attach response and persist them to disk
    tracing::info!("Updating {} location configs", tenant_configs.len());
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

@@ -597,18 +622,22 @@ pub async fn init_tenant_mgr(
        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
            SecondaryLocationConfig { warm: true };

        // Update the location config according to the re-attach response
        if let Some(tenant_modes) = &tenant_modes {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            match tenant_modes.get(&tenant_shard_id) {
                None => {
                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
                        );
                    }

                    match safe_rename_tenant_dir(&tenant_dir_path).await {
                        Ok(tmp_path) => {
                            spawn_background_purge(tmp_path);
                        }
                        Err(e) => {
                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
                                "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
                        }
                    };

                    // We deleted local content: move on to next tenant, don't try and spawn this one.
                    continue;
@@ -654,8 +683,32 @@ pub async fn init_tenant_mgr(

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
        config_write_futs.push(async move {
            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
            (tenant_shard_id, location_conf, r)
        });
    }

    // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
    tracing::info!(
        "Writing {} location config files...",
        config_write_futs.len()
    );
    let config_write_results = futures::stream::iter(config_write_futs)
        .buffer_unordered(16)
        .collect::<Vec<_>>()
        .await;

    tracing::info!(
        "Spawning {} tenant shard locations...",
        config_write_results.len()
    );
    // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
    for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
        // Errors writing configs are fatal
        config_write_result?;

        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
            LocationMode::Attached(attached_conf) => {
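A minimal, self-contained sketch of the fan-out pattern used above (the function name and the fixed limit of 16 are assumptions for the sketch): collect the futures first, then let `buffer_unordered` keep a bounded number in flight.

use futures::StreamExt;

// Run at most 16 jobs concurrently and gather every result.
async fn write_all(
    jobs: Vec<impl std::future::Future<Output = std::io::Result<()>>>,
) -> Vec<std::io::Result<()>> {
    futures::stream::iter(jobs)
        .buffer_unordered(16)
        .collect()
        .await
}

Compared with spawning a task per job, this bounds filesystem pressure while still overlapping the write latencies.
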
@@ -1699,7 +1752,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
        self.spawn_background_purge(tmp_path);
        spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1854,28 +1907,6 @@ impl TenantManager {
        shutdown_all_tenants0(self.tenants).await
    }

    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
    /// the background, and thereby avoid blocking any API requests on this deletion completing.
    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
        let task_tenant_id = None;

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::MgmtRequest,
            task_tenant_id,
            None,
            "tenant_files_delete",
            false,
            async move {
                fs::remove_dir_all(tmp_path.as_path())
                    .await
                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
            },
        );
    }

    pub(crate) async fn detach_tenant(
        &self,
        conf: &'static PageServerConf,
@@ -1892,7 +1923,7 @@ impl TenantManager {
            deletion_queue_client,
        )
        .await?;
        self.spawn_background_purge(tmp_path);
        spawn_background_purge(tmp_path);

        Ok(())
    }

@@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::collections::{BTreeMap, BinaryHeap, HashSet};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
@@ -78,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer {
}

pub struct InMemoryLayerInner {
    /// All versions of all pages in the layer are kept here. Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
    index: HashMap<Key, VecMap<Lsn, u64>>,
    index: BTreeMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
@@ -384,29 +384,24 @@ impl InMemoryLayer {
        let mut planned_block_reads = BinaryHeap::new();

        for range in keyspace.ranges.iter() {
            let mut key = range.start;
            while key < range.end {
                if let Some(vec_map) = inner.index.get(&key) {
                    let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
                        Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
                        None => self.start_lsn..end_lsn,
                    };
            for (key, vec_map) in inner.index.range(range.start..range.end) {
                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
                    Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
                    None => self.start_lsn..end_lsn,
                };

                    let slice = vec_map.slice_range(lsn_range);
                    for (entry_lsn, pos) in slice.iter().rev() {
                        planned_block_reads.push(BlockRead {
                            key,
                            lsn: *entry_lsn,
                            block_offset: *pos,
                        });
                    }
                let slice = vec_map.slice_range(lsn_range);
                for (entry_lsn, pos) in slice.iter().rev() {
                    planned_block_reads.push(BlockRead {
                        key: *key,
                        lsn: *entry_lsn,
                        block_offset: *pos,
                    });
                }

                key = key.next();
            }
        }

        let keyspace_size = keyspace.total_size();
        let keyspace_size = keyspace.total_raw_size();

        let mut completed_keys = HashSet::new();
        while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
@@ -499,7 +494,7 @@ impl InMemoryLayer {
|
||||
end_lsn: OnceLock::new(),
|
||||
opened_at: Instant::now(),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
index: HashMap::new(),
|
||||
index: BTreeMap::new(),
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
@@ -602,14 +597,17 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
|
||||
/// layer will only contain the key range the user specifies, and may return `None`
|
||||
/// if there are no matching keys.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub(crate) async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ResidentLayer> {
|
||||
key_range: Option<Range<Key>>,
|
||||
) -> Result<Option<ResidentLayer>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -623,6 +621,21 @@ impl InMemoryLayer {
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
let keys: Vec<_> = if let Some(key_range) = key_range {
|
||||
inner
|
||||
.index
|
||||
.iter()
|
||||
.filter(|(k, _)| key_range.contains(k))
|
||||
.map(|(k, m)| (k.to_i128(), m))
|
||||
.collect()
|
||||
} else {
|
||||
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -636,26 +649,17 @@ impl InMemoryLayer {
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
// Sort the keys because delta layer writer expects them sorted.
|
||||
//
|
||||
// NOTE: this sort can take up significant time if the layer has millions of
|
||||
// keys. To speed up all the comparisons we convert the key to i128 and
|
||||
// keep the value as a reference.
|
||||
let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
|
||||
keys.sort_unstable_by_key(|k| k.0);
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
for (key, vec_map) in keys.iter() {
|
||||
let key = Key::from_i128(*key);
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(key, *lsn, buf, will_init)
|
||||
.put_value_bytes(*key, *lsn, buf, will_init)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
@@ -663,6 +667,6 @@ impl InMemoryLayer {
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
|
||||
Ok(delta_layer)
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -401,8 +401,8 @@ impl Layer {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
&self.0.path_str
|
||||
pub(crate) fn debug_str(&self) -> &Arc<str> {
|
||||
&self.0.debug_str
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
@@ -527,8 +527,8 @@ struct LayerInner {
|
||||
/// Full path to the file; unclear if this should exist anymore.
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// String representation of the full path, used for traversal id.
|
||||
path_str: Arc<str>,
|
||||
/// String representation of the layer, used for traversal id.
|
||||
debug_str: Arc<str>,
|
||||
|
||||
desc: PersistentLayerDesc,
|
||||
|
||||
@@ -735,7 +735,7 @@ impl LayerInner {
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
path_str: path.to_string().into(),
|
||||
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
|
||||
path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
|
||||
@@ -17,7 +17,7 @@ use fail::fail_point;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
|
||||
keyspace::KeySpaceAccum,
|
||||
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
@@ -55,7 +55,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -66,6 +65,7 @@ use crate::{
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
};
|
||||
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
|
||||
use crate::{
|
||||
disk_usage_eviction_task::finite_f32,
|
||||
tenant::storage_layer::{
|
||||
@@ -86,7 +86,7 @@ use crate::{
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::metrics::{
|
||||
GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
};
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
@@ -137,6 +137,25 @@ pub(super) enum FlushLoopState {
|
||||
Exited,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
pub enum ImageLayerCreationMode {
|
||||
/// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path.
|
||||
Try,
|
||||
/// Force creating the image layers if possible. For now, no image layers will be created
|
||||
/// for metadata keys. Used in compaction code path with force flag enabled.
|
||||
Force,
|
||||
/// Initial ingestion of the data, and no data should be dropped in this function. This
|
||||
/// means that no metadata keys should be included in the partitions. Used in flush frozen layer
|
||||
/// code path.
|
||||
Initial,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ImageLayerCreationMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) struct Hole {
|
||||
@@ -317,7 +336,7 @@ pub struct Timeline {
|
||||
pub initdb_lsn: Lsn,
|
||||
|
||||
/// When did we last calculate the partitioning?
|
||||
partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
|
||||
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
|
||||
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
@@ -936,7 +955,7 @@ impl Timeline {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
|
||||
let key_count = keyspace.total_size().try_into().unwrap();
|
||||
let key_count = keyspace.total_raw_size().try_into().unwrap();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
@@ -1076,7 +1095,7 @@ impl Timeline {
|
||||
mut reconstruct_state: ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let get_kind = if keyspace.total_size() == 1 {
|
||||
let get_kind = if keyspace.total_raw_size() == 1 {
|
||||
GetKind::Singular
|
||||
} else {
|
||||
GetKind::Vectored
|
||||
@@ -1149,6 +1168,11 @@ impl Timeline {
|
||||
panic!(concat!("Sequential get failed with {}, but vectored get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
seq_err, keyspace, lsn) },
|
||||
(Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
|
||||
// Sequential get runs after vectored get, so it is possible for the latter
// to time out while waiting for its ancestor's Lsn to become ready and for the
// former to succeed (it essentially has a doubled wait time).
},
(Ok(_), Err(vec_err)) => {
panic!(concat!("Vectored get failed with {}, but sequential get did not",
" - keyspace={:?} lsn={}"),
@@ -1229,6 +1253,12 @@ impl Timeline {
self.last_record_lsn.load()
}

/// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
/// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
self.last_record_lsn.status_receiver()
}

pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn.load()
}
@@ -2099,7 +2129,10 @@ impl Timeline {
// initial logical size is 0.
LogicalSize::empty_initial()
},
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
partitioning: tokio::sync::Mutex::new((
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
Lsn(0),
)),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),

@@ -2915,7 +2948,7 @@ trait TraversalLayerExt {

impl TraversalLayerExt for Layer {
fn traversal_id(&self) -> TraversalId {
Arc::clone(self.local_path_str())
Arc::clone(self.debug_str())
}
}

@@ -3101,7 +3134,6 @@ impl Timeline {
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = guard.get_from_desc(&layer);
drop(guard);

// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3202,7 +3234,7 @@ impl Timeline {
}
}

if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
break;
}

@@ -3215,14 +3247,14 @@ impl Timeline {
timeline = &*timeline_owned;
}

if keyspace.total_size() != 0 {
if keyspace.total_raw_size() != 0 {
return Err(GetVectoredError::MissingKey(keyspace.start().unwrap()));
}

Ok(())
}

/// Collect the reconstruct data for a ketspace from the specified timeline.
/// Collect the reconstruct data for a keyspace from the specified timeline.
///
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
/// the current keyspace. The current keyspace of the search at any given timeline
@@ -3651,66 +3683,103 @@ impl Timeline {
// files instead. This is possible as long as *all* the data imported into the
// repository have the same LSN.
let lsn_range = frozen_layer.get_lsn_range();
let (layers_to_upload, delta_layer_to_add) =
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
initdb_optimization_count,
..
} => {

// Whether to directly create image layers for this flush, or flush them as delta layers
let create_image_layer =
lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);

#[cfg(test)]
{
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
..
} => {
if create_image_layer {
*initdb_optimization_count += 1;
}
}
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let (partitioning, _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
)
.await?;

if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}

// For image layers, we add them immediately into the layer map.
(
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?,
None,
)
} else {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
..
} => {
} else {
assert!(!*expect_initdb_optimization, "expected initdb optimization");
}
}
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
}
}

let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let ((rel_partition, metadata_partition), _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
)
.await?;

if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}

// For metadata, always create delta layers.
let delta_layer = if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single aux file keyspace"
);
let metadata_keyspace = &metadata_partition.parts[0];
assert_eq!(
metadata_keyspace.0.ranges.len(),
1,
"aux file keyspace should be a single range"
);
self.create_delta_layer(
&frozen_layer,
ctx,
Some(metadata_keyspace.0.ranges[0].clone()),
)
.await?
} else {
None
};

// For image layers, we add them immediately into the layer map.
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
&rel_partition,
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);

if let Some(delta_layer) = delta_layer {
layers_to_upload.push(delta_layer.clone());
(layers_to_upload, Some(delta_layer))
} else {
(layers_to_upload, None)
}
} else {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
};

pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

if self.cancel.is_cancelled() {
@@ -3830,12 +3899,18 @@ impl Timeline {
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
key_range: Option<Range<Key>>,
) -> anyhow::Result<Option<ResidentLayer>> {
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
let ctx = ctx.attached_child();
let work = async move {
let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
let Some(new_delta) = frozen_layer
.write_to_disk(&self_clone, &ctx, key_range)
.await?
else {
return Ok(None);
};
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
@@ -3854,7 +3929,7 @@ impl Timeline {
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
anyhow::Ok(new_delta)
anyhow::Ok(Some(new_delta))
};
// Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
// Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
@@ -3881,19 +3956,20 @@ impl Timeline {
partition_size: u64,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
anyhow::bail!("repartition() called concurrently, this should not happen");
};
if lsn < partitioning_guard.1 {
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
if lsn < *partition_lsn {
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
}

let distance = lsn.0 - partitioning_guard.1 .0;
if partitioning_guard.1 != Lsn(0)
let distance = lsn.0 - partition_lsn.0;
if *partition_lsn != Lsn(0)
&& distance <= self.repartition_threshold
&& !flags.contains(CompactFlags::ForceRepartition)
{
@@ -3902,13 +3978,18 @@ impl Timeline {
threshold = self.repartition_threshold,
"no repartitioning needed"
);
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
return Ok((
(dense_partition.clone(), sparse_partition.clone()),
*partition_lsn,
));
}

let keyspace = self.collect_keyspace(lsn, ctx).await?;
let partitioning = keyspace.partition(partition_size);

*partitioning_guard = (partitioning, lsn);
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
}; // no partitioning for metadata keys for now
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);

Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}
@@ -3964,12 +4045,12 @@ impl Timeline {
false
}

#[tracing::instrument(skip_all, fields(%lsn, %force))]
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
async fn create_image_layers(
self: &Arc<Timeline>,
partitioning: &KeyPartitioning,
lsn: Lsn,
force: bool,
mode: ImageLayerCreationMode,
ctx: &RequestContext,
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
@@ -4006,19 +4087,26 @@ impl Timeline {
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;

let do_it = if force {
true
} else if check_for_image_layers {
// [`Self::time_for_new_image_layer`] is CPU expensive,
// so skip if we've not collected enough WAL since the last time
self.time_for_new_image_layer(partition, lsn).await
} else {
false
};

if !do_it {
start = img_range.end;
continue;
if partition.overlaps(&Key::metadata_key_range()) {
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
// rather big change. Keep this patch small for now.
match mode {
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
// skip image layer creation anyways for metadata keys.
start = img_range.end;
continue;
}
ImageLayerCreationMode::Initial => {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
}
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
start = img_range.end;
continue;
}
}

let mut image_layer_writer = ImageLayerWriter::new(
@@ -4059,7 +4147,7 @@ impl Timeline {
key = key.next();

// Maybe flush `key_request_accum`
if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|| last_key_in_range
{
let results = self

@@ -9,13 +9,13 @@ use std::ops::{Deref, Range};
use std::sync::Arc;

use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};

use anyhow::{anyhow, Context};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::shard::TenantShardId;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -102,7 +102,7 @@ impl Timeline {
)
.await
{
Ok((partitioning, lsn)) => {
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,17 +115,37 @@ impl Timeline {

// 3. Create new image layers for partitions that have been modified
// "enough".
let layers = self
let dense_layers = self
.create_image_layers(
&partitioning,
&dense_partitioning,
lsn,
flags.contains(CompactFlags::ForceImageLayerCreation),
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;

self.upload_new_image_layers(layers)?;
// For now, nothing will be produced...
let sparse_layers = self
.create_image_layers(
&sparse_partitioning.clone().into_dense(),
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
assert!(sparse_layers.is_empty());

self.upload_new_image_layers(dense_layers)?;
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -758,8 +778,9 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}

let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));

pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
@@ -831,6 +852,10 @@ impl CompactionJobExecutor for TimelineAdaptor {

type RequestContext = crate::context::RequestContext;

fn get_shard_identity(&self) -> &ShardIdentity {
self.timeline.get_shard_identity()
}

async fn get_layers(
&mut self,
key_range: &Range<Key>,

@@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;

use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::proto::{
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
SubscribeByFilterRequest, TypeSubscription, TypedMessage,
};
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step(
.timeline
.subscribe_for_state_updates();

let mut wait_lsn_status = connection_manager_state
.timeline
.subscribe_for_wait_lsn_updates();

// TODO: create a separate config option for discovery request interval
let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
let mut last_discovery_ts: Option<std::time::Instant> = None;

// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
@@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step(

loop {
let time_until_next_retry = connection_manager_state.time_until_next_retry();
let any_activity = connection_manager_state.wal_connection.is_some()
|| !connection_manager_state.wal_stream_candidates.is_empty();

// These things are happening concurrently:
//
// - cancellation request
// - cancellation request
// - keep receiving WAL on the current connection
// - if the shared state says we need to change connection, disconnect and return
// - this runs in a separate task and we receive updates via a watch channel
@@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step(
// - receive updates from broker
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
// - if there's no connection and no candidates, try to send a discovery request

// NB: make sure each of the select expressions are cancellation-safe
// (no need for arms to be cancellation-safe).
@@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step(
}
}
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),

Some(()) = async {
// Reminder: this match arm needs to be cancellation-safe.
// Calculating time needed to wait until sending the next discovery request.
// Current implementation is conservative and sends discovery requests only when there are no candidates.

if any_activity {
// No need to send discovery requests if there is an active connection or candidates.
return None;
}

// Waiting for an active wait_lsn request.
while wait_lsn_status.borrow().is_none() {
if wait_lsn_status.changed().await.is_err() {
// wait_lsn_status channel was closed, exiting
warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
return None;
}
}

// All preconditions met, preparing to send a discovery request.
let now = std::time::Instant::now();
let next_discovery_ts = last_discovery_ts
.map(|ts| ts + discovery_request_interval)
.unwrap_or_else(|| now);

if next_discovery_ts > now {
// Prevent sending discovery requests too frequently.
tokio::time::sleep(next_discovery_ts - now).await;
}

let tenant_timeline_id = Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
let msg = TypedMessage {
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: Some(request),
safekeeper_discovery_response: None,
};

last_discovery_ts = Some(std::time::Instant::now());
debug!("No active connection and no candidates, sending discovery request to the broker");

// Cancellation safety: we want to send a message to the broker, but publish_one()
// function can get cancelled by the other select! arm. This is absolutely fine, because
// we just want to receive broker updates and discovery is not important if we already
// receive updates.
//
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
// This is totally fine because of the reason above.

// This is a fire-and-forget request, we don't care about the response
let _ = broker_client.publish_one(msg).await;
debug!("Discovery request sent to the broker");
None
} => {}
}

if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates(
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
cancel: &CancellationToken,
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
) -> Result<Streaming<TypedMessage>, Cancelled> {
let mut attempt = 0;
loop {
exponential_backoff(
@@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates(
attempt += 1;

// subscribe to the specific timeline
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SubscribeSafekeeperInfoRequest {
subscription_key: Some(key),
let request = SubscribeByFilterRequest {
types: vec![
TypeSubscription {
r#type: MessageType::SafekeeperTimelineInfo as i32,
},
TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
},
],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: true,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
}),
}),
};

match {
tokio::select! {
r = broker_client.subscribe_safekeeper_info(request) => { r }
r = broker_client.subscribe_by_filter(request) => { r }
_ = cancel.cancelled() => { return Err(Cancelled); }
}
} {
@@ -398,7 +480,7 @@ struct RetryInfo {
/// Data about the timeline to connect to, received from the broker.
#[derive(Debug, Clone)]
struct BrokerSkTimeline {
timeline: SafekeeperTimelineInfo,
timeline: SafekeeperDiscoveryResponse,
/// Time at which the data was fetched from the broker last time, to track the stale data.
latest_update: NaiveDateTime,
}
@@ -606,7 +688,41 @@ impl ConnectionManagerState {
}

/// Adds another broker timeline into the state, if it's more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
let mut is_discovery = false;
let timeline_update = match typed_msg.r#type() {
MessageType::SafekeeperTimelineInfo => {
let info = match typed_msg.safekeeper_timeline_info {
Some(info) => info,
None => {
warn!("bad proto message from broker: no safekeeper_timeline_info");
return;
}
};
SafekeeperDiscoveryResponse {
safekeeper_id: info.safekeeper_id,
tenant_timeline_id: info.tenant_timeline_id,
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
}
}
MessageType::SafekeeperDiscoveryResponse => {
is_discovery = true;
match typed_msg.safekeeper_discovery_response {
Some(response) => response,
None => {
warn!("bad proto message from broker: no safekeeper_discovery_response");
return;
}
}
}
_ => {
// unexpected message
return;
}
};

WALRECEIVER_BROKER_UPDATES.inc();

let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -619,7 +735,11 @@ impl ConnectionManagerState {
);

if old_entry.is_none() {
info!("New SK node was added: {new_safekeeper_id}");
info!(
?is_discovery,
%new_safekeeper_id,
"New SK node was added",
);
WALRECEIVER_CANDIDATES_ADDED.inc();
}
}
@@ -818,7 +938,7 @@ impl ConnectionManagerState {
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
self.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.max_by_key(|(_, info, _)| info.commit_lsn)
@@ -828,7 +948,7 @@ impl ConnectionManagerState {
/// Some safekeepers are filtered by the retry cooldown.
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
let now = Utc::now().naive_utc();

self.wal_stream_candidates
@@ -968,19 +1088,11 @@ mod tests {
latest_update: NaiveDateTime,
) -> BrokerSkTimeline {
BrokerSkTimeline {
timeline: SafekeeperTimelineInfo {
timeline: SafekeeperDiscoveryResponse {
safekeeper_id: 0,
tenant_timeline_id: None,
term: 0,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
backup_lsn: 0,
remote_consistent_lsn: 0,
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
http_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
},
latest_update,

@@ -22,7 +22,12 @@ serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres_ffi.workspace = true
tokio-stream.workspace = true
tokio-postgres.workspace = true
tokio-util = { workspace = true }
futures-util.workspace = true
itertools.workspace = true
camino.workspace = true

@@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted.

#### `scan-metadata`

Walk objects in a pageserver S3 bucket, and report statistics on the contents.
Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency.
Errors are logged to stderr and summary to stdout.

For pageserver:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver

Timelines: 31106
With errors: 3
@@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
```

For safekeepers, `dump_db_connstr` and `dump_db_table` must be
specified; they should point to a table with a debug dump, which will be used
to list timelines and find their backup and start LSNs. An example invocation
is sketched below.

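An illustrative invocation (a sketch only: the `--dump-db-connstr` and `--dump-db-table` flag names assume clap's default kebab-case naming for the `ScanMetadata` arguments added in this change, and the connection string and table name are placeholders):

```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket cargo run --release -- scan-metadata --node-kind safekeeper --dump-db-connstr postgres://user:password@dump-db-host/dump_db --dump-db-table debug_dump_table
```
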
## Cleaning up running pageservers

If S3 state is altered manually first, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload triggered by compaction, a pageserver restart, etc.). So before proceeding, any tenants/timelines that are already deleted in the console must first be removed from the pageservers.

@@ -1,11 +1,13 @@
use std::time::Duration;

use chrono::{DateTime, Utc};
use futures::Future;
use hex::FromHex;

use reqwest::{header, Client, StatusCode, Url};
use serde::Deserialize;
use tokio::sync::Semaphore;

use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

@@ -137,7 +139,7 @@ pub struct ProjectData {
pub region_id: String,
pub platform_id: String,
pub user_id: String,
pub pageserver_id: u64,
pub pageserver_id: Option<u64>,
#[serde(deserialize_with = "from_nullable_id")]
pub tenant: TenantId,
pub safekeepers: Vec<SafekeeperData>,
@@ -155,7 +157,7 @@ pub struct ProjectData {
pub maintenance_set: Option<String>,
}

#[derive(Debug, serde::Deserialize)]
#[derive(Debug, Clone, serde::Deserialize)]
pub struct BranchData {
pub id: BranchId,
pub created_at: DateTime<Utc>,
@@ -210,30 +212,39 @@ impl CloudAdminApiClient {
.await
.expect("Semaphore is not closed");

let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("tenant_id", tenant_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("tenant_id", tenant_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::RequestSend(e),
)
})?;

let response: AdminApiResponse<Vec<ProjectData>> =
response.json().await.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::BodyRead(e),
)
})?;
Ok(response)
},
"find_tenant_project",
)
.await?;

let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::BodyRead(e),
)
})?;
match response.data.len() {
0 => Ok(None),
1 => Ok(Some(
@@ -261,42 +272,34 @@ impl CloudAdminApiClient {
const PAGINATION_LIMIT: usize = 512;
let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
loop {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("show_deleted", "false".to_string()),
("limit", format!("{PAGINATION_LIMIT}")),
("offset", format!("{pagination_offset}")),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"List active projects".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response_bytes = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("show_deleted", "false".to_string()),
("limit", format!("{PAGINATION_LIMIT}")),
("offset", format!("{pagination_offset}")),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"List active projects".to_string(),
ErrorKind::RequestSend(e),
)
})?;

match response.status() {
StatusCode::OK => {}
StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
tokio::time::sleep(Duration::from_millis(500)).await;
continue;
}
_status => {
return Err(Error::new(
"List active projects".to_string(),
ErrorKind::ResponseStatus(response.status()),
))
}
}

let response_bytes = response.bytes().await.map_err(|e| {
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
})?;
response.bytes().await.map_err(|e| {
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
})
},
"list_projects",
)
.await?;

let decode_result =
serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -327,6 +330,7 @@ impl CloudAdminApiClient {

pub async fn find_timeline_branch(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Option<BranchData>, Error> {
let _permit = self
@@ -335,43 +339,61 @@ impl CloudAdminApiClient {
.await
.expect("Semaphore is not closed");

let response = self
.http_client
.get(self.append_url("/branches"))
.query(&[
("timeline_id", timeline_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/branches"))
.query(&[
("timeline_id", timeline_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::RequestSend(e),
)
})?;

let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::BodyRead(e),
)
})?;
match response.data.len() {
0 => Ok(None),
1 => Ok(Some(
response
.data
.into_iter()
.next()
.expect("Should have exactly one element"),
)),
too_many => Err(Error::new(
format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
let response: AdminApiResponse<Vec<BranchData>> =
response.json().await.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::BodyRead(e),
)
})?;
Ok(response)
},
"find_timeline_branch",
)
.await?;

let mut branches: Vec<BranchData> = response.data.into_iter().collect();
// Normally timeline_id is unique. However, we do have at least one case
// of the same timeline_id in two different projects, apparently after
// manual recovery. So always recheck project_id (discovered through
// tenant_id).
let project_data = match self.find_tenant_project(tenant_id).await? {
Some(pd) => pd,
None => return Ok(None),
};
branches.retain(|b| b.project_id == project_data.id);
if branches.len() < 2 {
Ok(branches.first().cloned())
} else {
Err(Error::new(
format!(
"Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
tenant_id,
timeline_id,
branches.len()
),
ErrorKind::UnexpectedState,
)),
))
}
}

@@ -532,4 +554,15 @@ impl CloudAdminApiClient {
.parse()
.unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
}

async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
where
O: FnMut() -> F,
F: Future<Output = Result<T, Error>>,
{
let cancel = CancellationToken::new(); // not really used
backoff::retry(op, |_| false, 1, 20, description, &cancel)
.await
.expect("cancellations are disabled")
}
}

@@ -60,6 +60,7 @@ pub struct GarbageList {
/// see garbage, we saw some active tenants too. This protects against classes of bugs
/// in the scrubber that might otherwise generate a "deleted all" result.
active_tenant_count: usize,
active_timeline_count: usize,
}

impl GarbageList {
@@ -67,6 +68,7 @@ impl GarbageList {
Self {
items: Vec::new(),
active_tenant_count: 0,
active_timeline_count: 0,
node_kind,
bucket_config,
}
@@ -119,7 +121,10 @@ pub async fn find_garbage(
const S3_CONCURRENCY: usize = 32;

// How many concurrent API requests to make to the console API.
const CONSOLE_CONCURRENCY: usize = 128;
//
// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It
// would be better to implement a real RPS limiter.
const CONSOLE_CONCURRENCY: usize = 16;
|
||||
|
||||
struct ConsoleCache {
|
||||
/// Set of tenants found in the control plane API
|
||||
@@ -221,6 +226,7 @@ async fn find_garbage_inner(
|
||||
} else {
|
||||
tracing::debug!("Tenant {tenant_shard_id} is active");
|
||||
active_tenants.push(tenant_shard_id);
|
||||
garbage.active_tenant_count = active_tenants.len();
|
||||
}
|
||||
|
||||
counter += 1;
|
||||
@@ -261,7 +267,7 @@ async fn find_garbage_inner(
|
||||
let api_client = cloud_admin_api_client.clone();
|
||||
async move {
|
||||
api_client
|
||||
.find_timeline_branch(ttid.timeline_id)
|
||||
.find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))
|
||||
.map(|r| (ttid, r))
|
||||
@@ -271,15 +277,29 @@ async fn find_garbage_inner(
|
||||
std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));
|
||||
|
||||
// Update the GarbageList with any timelines which appear not to exist.
|
||||
let mut active_timelines: Vec<TenantShardTimelineId> = vec![];
|
||||
while let Some(result) = timelines_checked.next().await {
|
||||
let (ttid, console_result) = result?;
|
||||
if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
|
||||
tracing::debug!("Timeline {ttid} is garbage");
|
||||
} else {
|
||||
tracing::debug!("Timeline {ttid} is active");
|
||||
active_timelines.push(ttid);
|
||||
garbage.active_timeline_count = active_timelines.len();
|
||||
}
|
||||
}
|
||||
|
||||
let num_garbage_timelines = garbage
|
||||
.items
|
||||
.iter()
|
||||
.filter(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
|
||||
.count();
|
||||
tracing::info!(
|
||||
"Found {}/{} garbage timelines in active tenants",
|
||||
num_garbage_timelines,
|
||||
active_timelines.len(),
|
||||
);
|
||||
|
||||
Ok(garbage)
|
||||
}
|
||||
|
||||
@@ -344,16 +364,22 @@ pub async fn get_timeline_objects(
|
||||
const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
/// Drain a buffer of keys into DeleteObjects requests
|
||||
///
|
||||
/// If `drain` is true, drains keys completely; otherwise stops when <
|
||||
/// MAX_KEYS_PER_DELETE keys are left.
|
||||
/// `num_deleted` returns number of deleted keys.
|
||||
async fn do_delete(
|
||||
s3_client: &Arc<Client>,
|
||||
bucket_name: &str,
|
||||
keys: &mut Vec<ObjectIdentifier>,
|
||||
dry_run: bool,
|
||||
drain: bool,
|
||||
progress_tracker: &mut DeletionProgressTracker,
|
||||
) -> anyhow::Result<()> {
|
||||
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
|
||||
let request_keys =
|
||||
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
|
||||
let num_deleted = request_keys.len();
|
||||
if dry_run {
|
||||
tracing::info!("Dry-run deletion of objects: ");
|
||||
for k in request_keys {
|
||||
@@ -368,12 +394,30 @@ async fn do_delete(
|
||||
.send()
|
||||
.await
|
||||
.context("DeleteObjects request")?;
|
||||
progress_tracker.register(num_deleted);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Simple tracker reporting each 10k deleted keys.
|
||||
#[derive(Default)]
|
||||
struct DeletionProgressTracker {
|
||||
num_deleted: usize,
|
||||
last_reported_num_deleted: usize,
|
||||
}
|
||||
|
||||
impl DeletionProgressTracker {
|
||||
fn register(&mut self, n: usize) {
|
||||
self.num_deleted += n;
|
||||
if self.num_deleted - self.last_reported_num_deleted > 10000 {
|
||||
tracing::info!("progress: deleted {} keys", self.num_deleted);
|
||||
self.last_reported_num_deleted = self.num_deleted;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn purge_garbage(
|
||||
input_path: String,
|
||||
mode: PurgeMode,
|
||||
@@ -394,6 +438,14 @@ pub async fn purge_garbage(
|
||||
if garbage_list.active_tenant_count == 0 {
|
||||
anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
|
||||
}
|
||||
if garbage_list
|
||||
.items
|
||||
.iter()
|
||||
.any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
|
||||
&& garbage_list.active_timeline_count == 0
|
||||
{
|
||||
anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
|
||||
}
|
||||
|
||||
let filtered_items = garbage_list
|
||||
.items
|
||||
@@ -429,6 +481,7 @@ pub async fn purge_garbage(
|
||||
std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));
|
||||
|
||||
let mut objects_to_delete = Vec::new();
|
||||
let mut progress_tracker = DeletionProgressTracker::default();
|
||||
while let Some(result) = get_objects_results.next().await {
|
||||
let mut object_list = result?;
|
||||
objects_to_delete.append(&mut object_list);
|
||||
@@ -439,6 +492,7 @@ pub async fn purge_garbage(
|
||||
&mut objects_to_delete,
|
||||
dry_run,
|
||||
false,
|
||||
&mut progress_tracker,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -450,10 +504,11 @@ pub async fn purge_garbage(
|
||||
&mut objects_to_delete,
|
||||
dry_run,
|
||||
true,
|
||||
&mut progress_tracker,
|
||||
)
|
||||
.await?;
|
||||
|
||||
tracing::info!("Fell through");
|
||||
tracing::info!("{} keys deleted in total", progress_tracker.num_deleted);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -4,7 +4,8 @@ pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
 pub mod metadata_stream;
-pub mod scan_metadata;
+pub mod scan_pageserver_metadata;
+pub mod scan_safekeeper_metadata;
 pub mod tenant_snapshot;

 use std::env;
@@ -141,12 +142,17 @@ impl RootTarget {
    pub fn tenants_root(&self) -> S3Target {
        match self {
            Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
-           Self::Safekeeper(root) => root.with_sub_segment("wal"),
+           Self::Safekeeper(root) => root.clone(),
        }
    }

    pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
-       self.tenants_root().with_sub_segment(&tenant_id.to_string())
+       match self {
+           Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
+           Self::Safekeeper(_) => self
+               .tenants_root()
+               .with_sub_segment(&tenant_id.tenant_id.to_string()),
+       }
    }
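Net effect of these RootTarget changes, read together with the init_remote hunk below: safekeeper objects are addressed by plain TenantId (safekeepers are not sharded) directly under the bucket prefix, which now defaults to "wal/" instead of "safekeeper/v1" — i.e. keys shaped roughly like wal/<tenant_id>/<timeline_id>/<segment> (the timeline level is inferred from check_timeline in the new scan module further down). The pageserver side keeps its per-shard layout under its tenants segment, keyed by TenantShardId.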
    pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
@@ -337,9 +343,7 @@ fn init_remote(
        }),
        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
            bucket_name: bucket_config.bucket,
-           prefix_in_bucket: bucket_config
-               .prefix_in_bucket
-               .unwrap_or("safekeeper/v1".to_string()),
+           prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
            delimiter,
        }),
    };
@@ -364,7 +368,10 @@ async fn list_objects_with_retries(
    {
        Ok(response) => return Ok(response),
        Err(e) => {
-           error!("list_objects_v2 query failed: {e}");
+           error!(
+               "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
+               s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
+           );
            tokio::time::sleep(Duration::from_secs(1)).await;
        }
    }
@@ -1,9 +1,13 @@
+use anyhow::bail;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use s3_scrubber::scan_metadata::scan_metadata;
+use s3_scrubber::scan_pageserver_metadata::scan_metadata;
 use s3_scrubber::tenant_snapshot::SnapshotDownloader;
-use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
+use s3_scrubber::{
+    init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
+    NodeKind, TraversingDepth,
+};

 use clap::{Parser, Subcommand};
 use utils::id::TenantId;
@@ -35,11 +39,20 @@ enum Command {
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
    },
+   #[command(verbatim_doc_comment)]
    ScanMetadata {
+       #[arg(short, long)]
+       node_kind: NodeKind,
        #[arg(short, long, default_value_t = false)]
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
+       #[arg(long, default_value = None)]
+       /// For safekeeper node_kind only, points to db with debug dump
+       dump_db_connstr: Option<String>,
+       /// For safekeeper node_kind only, table in the db with debug dump
+       #[arg(long, default_value = None)]
+       dump_db_table: Option<String>,
    },
    TenantSnapshot {
        #[arg(long = "tenant-id")]
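Assuming clap's usual kebab-case conversion of field names, the extended subcommand would presumably be invoked as s3_scrubber scan-metadata --node-kind safekeeper --dump-db-connstr <connstr> --dump-db-table <table> [--json]; for pageservers the two dump-db flags stay unset, matching the test fixture's scan-metadata --node-kind pageserver --json call further down in this change set.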
@@ -72,33 +85,75 @@ async fn main() -> anyhow::Result<()> {
    ));

    match cli.command {
-       Command::ScanMetadata { json, tenant_ids } => {
-           match scan_metadata(bucket_config.clone(), tenant_ids).await {
-               Err(e) => {
-                   tracing::error!("Failed: {e}");
-                   Err(e)
-               }
-               Ok(summary) => {
-                   if json {
-                       println!("{}", serde_json::to_string(&summary).unwrap())
-                   } else {
-                       println!("{}", summary.summary_string());
-                   }
-                   if summary.is_fatal() {
-                       Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                   } else if summary.is_empty() {
-                       // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                       // scrubber they were likely expecting to scan something, and if we see no timelines
-                       // at all then it's likely due to some configuration issues like a bad prefix
-                       Err(anyhow::anyhow!(
-                           "No timelines found in bucket {} prefix {}",
-                           bucket_config.bucket,
-                           bucket_config
-                               .prefix_in_bucket
-                               .unwrap_or("<none>".to_string())
-                       ))
-                   } else {
-                       Ok(())
-                   }
-               }
-           }
-       }
+       Command::ScanMetadata {
+           json,
+           tenant_ids,
+           node_kind,
+           dump_db_connstr,
+           dump_db_table,
+       } => {
+           if let NodeKind::Safekeeper = node_kind {
+               let dump_db_connstr =
+                   dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
+               let dump_db_table =
+                   dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
+
+               let summary = scan_safekeeper_metadata(
+                   bucket_config.clone(),
+                   tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
+                   dump_db_connstr,
+                   dump_db_table,
+               )
+               .await?;
+               if json {
+                   println!("{}", serde_json::to_string(&summary).unwrap())
+               } else {
+                   println!("{}", summary.summary_string());
+               }
+               if summary.is_fatal() {
+                   bail!("Fatal scrub errors detected");
+               }
+               if summary.is_empty() {
+                   // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                   // scrubber they were likely expecting to scan something, and if we see no timelines
+                   // at all then it's likely due to some configuration issues like a bad prefix
+                   bail!(
+                       "No timelines found in bucket {} prefix {}",
+                       bucket_config.bucket,
+                       bucket_config
+                           .prefix_in_bucket
+                           .unwrap_or("<none>".to_string())
+                   );
+               }
+               Ok(())
+           } else {
+               match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                   Err(e) => {
+                       tracing::error!("Failed: {e}");
+                       Err(e)
+                   }
+                   Ok(summary) => {
+                       if json {
+                           println!("{}", serde_json::to_string(&summary).unwrap())
+                       } else {
+                           println!("{}", summary.summary_string());
+                       }
+                       if summary.is_fatal() {
+                           Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                       } else if summary.is_empty() {
+                           // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                           // scrubber they were likely expecting to scan something, and if we see no timelines
+                           // at all then it's likely due to some configuration issues like a bad prefix
+                           Err(anyhow::anyhow!(
+                               "No timelines found in bucket {} prefix {}",
+                               bucket_config.bucket,
+                               bucket_config
+                                   .prefix_in_bucket
+                                   .unwrap_or("<none>".to_string())
+                           ))
+                       } else {
+                           Ok(())
+                       }
+                   }
+               }
+           }
+       }
@@ -114,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>(
    let timelines_target = target.timelines_root(&tenant);

    loop {
-       tracing::info!("Listing in {}", tenant);
+       tracing::debug!("Listing in {}", tenant);
        let fetch_response =
            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
                .await;
@@ -151,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>(
        }
    }

-   tracing::info!("Yielding for {}", tenant);
+   tracing::debug!("Yielding for {}", tenant);
    Ok(stream! {
        for i in timeline_ids {
            let id = i?;
s3_scrubber/src/scan_safekeeper_metadata.rs (new file, 236 lines)
@@ -0,0 +1,236 @@
use std::{collections::HashSet, str::FromStr};

use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use serde::Serialize;
use tokio_postgres::types::PgLsn;
use tracing::{error, info, trace};
use utils::{
    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
};

use crate::{
    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
};

/// Generally we should ask the safekeepers, but so far the default 16MB is used everywhere.
const WAL_SEGSIZE: usize = 16 * 1024 * 1024;

#[derive(Serialize)]
pub struct MetadataSummary {
    timeline_count: usize,
    with_errors: HashSet<TenantTimelineId>,
    deleted_count: usize,
}

impl MetadataSummary {
    fn new() -> Self {
        Self {
            timeline_count: 0,
            with_errors: HashSet::new(),
            deleted_count: 0,
        }
    }

    pub fn summary_string(&self) -> String {
        format!(
            "timeline_count: {}, with_errors: {}",
            self.timeline_count,
            self.with_errors.len()
        )
    }

    pub fn is_empty(&self) -> bool {
        self.timeline_count == 0
    }

    pub fn is_fatal(&self) -> bool {
        !self.with_errors.is_empty()
    }
}
/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
/// statistics.
///
/// It works by listing timelines, along with their timeline_start_lsn and backup_lsn,
/// from the debug dump in dump_db_table, and verifying their S3 contents. If some WAL
/// segments are missing, the control plane is queried before complaining, to check
/// whether the project was deleted in the meantime.
pub async fn scan_safekeeper_metadata(
    bucket_config: BucketConfig,
    tenant_ids: Vec<TenantId>,
    dump_db_connstr: String,
    dump_db_table: String,
) -> anyhow::Result<MetadataSummary> {
    info!(
        "checking bucket {}, region {}, dump_db_table {}",
        bucket_config.bucket, bucket_config.region, dump_db_table
    );
    // Use the native TLS implementation (Neon requires TLS)
    let tls_connector =
        postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
    // The connection object performs the actual communication with the database,
    // so spawn it off to run on its own.
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("connection error: {}", e);
        }
    });

    let tenant_filter_clause = if !tenant_ids.is_empty() {
        format!(
            "and tenant_id in ({})",
            tenant_ids
                .iter()
                .map(|t| format!("'{}'", t))
                .collect::<Vec<_>>()
                .join(", ")
        )
    } else {
        "".to_owned()
    };
    let query = format!(
        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
        dump_db_table, tenant_filter_clause,
    );
    info!("query is {}", query);
    let timelines = client.query(&query, &[]).await?;
    info!("loaded {} timelines", timelines.len());

    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
    let console_config = ConsoleConfig::from_env()?;
    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);

    let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
        let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
        let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse timeline_id");
        let timeline_start_lsn_pg: PgLsn = row.get(2);
        let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
        let backup_lsn_pg: PgLsn = row.get(3);
        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
        check_timeline(
            &s3_client,
            &target,
            &cloud_admin_api_client,
            ttid,
            timeline_start_lsn,
            backup_lsn,
        )
    });
    // Run multiple check_timeline's concurrently.
    const CONCURRENCY: usize = 32;
    let mut timelines = checks.try_buffered(CONCURRENCY);

    let mut summary = MetadataSummary::new();
    while let Some(r) = timelines.next().await {
        let res = r?;
        summary.timeline_count += 1;
        if !res.is_ok {
            summary.with_errors.insert(res.ttid);
        }
        if res.is_deleted {
            summary.deleted_count += 1;
        }
    }

    Ok(summary)
}
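A note on the query: the debug dump presumably contains one row per safekeeper per timeline, so grouping by (tenant_id, timeline_id) with min(timeline_start_lsn) and max(backup_lsn) yields the widest WAL window any safekeeper claims for that timeline — the most demanding range the S3 check below can then be held to. Cancelled timelines are excluded up front via "where not is_cancelled".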
struct TimelineCheckResult {
    ttid: TenantTimelineId,
    is_ok: bool,
    is_deleted: bool, // timeline is deleted in cplane
}

/// List S3 and check that it has all the expected WAL for the ttid. Consistency
/// errors are logged to stderr; the returned result records whether the timeline
/// is consistent and whether it turned out to be deleted in the control plane.
async fn check_timeline(
    s3_client: &Client,
    root: &RootTarget,
    api_client: &CloudAdminApiClient,
    ttid: TenantTimelineId,
    timeline_start_lsn: Lsn,
    backup_lsn: Lsn,
) -> anyhow::Result<TimelineCheckResult> {
    trace!(
        "checking ttid {}, should contain WAL [{}-{}]",
        ttid,
        timeline_start_lsn,
        backup_lsn
    );
    // calculate expected segfiles
    let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
    let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
    let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
        (expected_first_segno..expected_last_segno)
            .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
    );
    let expected_files_num = expected_segfiles.len();
    trace!("expecting {} files", expected_segfiles.len());

    // now list s3 and check if it misses something
    let ttshid =
        TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
    let mut timeline_dir_target = root.timeline_root(&ttshid);
    // stream_listing yields only common_prefixes if delimiter is not empty, but
    // we need files, so unset it.
    timeline_dir_target.delimiter = String::new();

    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
    while let Some(obj) = stream.next().await {
        let obj = obj?;
        let key = obj.key();

        let seg_name = key
            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
            .expect("failed to extract segment name");
        expected_segfiles.remove(seg_name);
    }
    if !expected_segfiles.is_empty() {
        // Before complaining, check cplane: probably the timeline is already deleted.
        let bdata = api_client
            .find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
            .await?;
        let deleted = match bdata {
            Some(bdata) => bdata.deleted,
            None => {
                // note: should be careful with selecting proper cplane address
                info!("ttid {} not found, assuming it is deleted", ttid);
                true
            }
        };
        if deleted {
            // ok, branch is deleted
            return Ok(TimelineCheckResult {
                ttid,
                is_ok: true,
                is_deleted: true,
            });
        }
        error!(
            "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
            ttid,
            expected_segfiles.len(),
            expected_files_num,
            timeline_start_lsn,
            backup_lsn,
        );
        return Ok(TimelineCheckResult {
            ttid,
            is_ok: false,
            is_deleted: false,
        });
    }
    Ok(TimelineCheckResult {
        ttid,
        is_ok: true,
        is_deleted: false,
    })
}
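As a worked example of the segment arithmetic above — a hypothetical standalone sketch; segment_number and XLogFileName are the same helpers this file imports, and the filename literal assumes standard PostgreSQL segment naming:

    use postgres_ffi::{XLogFileName, PG_TLI};
    use utils::lsn::Lsn;

    fn main() {
        const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
        // A start LSN at 32 MiB falls into segment 2; a backup LSN at 80 MiB into segment 5.
        let timeline_start_lsn = Lsn(0x0200_0000);
        let backup_lsn = Lsn(0x0500_0000);
        let first = timeline_start_lsn.segment_number(WAL_SEGSIZE); // 2
        let last = backup_lsn.segment_number(WAL_SEGSIZE); // 5
        // The half-open range 2..5 expects segments 2, 3 and 4: segment 5 begins
        // exactly at backup_lsn and is not yet required to be backed up.
        let expected: Vec<String> = (first..last)
            .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE))
            .collect();
        assert_eq!(expected.len(), 3);
        assert_eq!(expected[0], "000000010000000000000002");
    }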
@@ -177,6 +177,10 @@ struct Args {
    /// Controls how long backup will wait until uploading the partial segment.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
    partial_backup_timeout: Duration,
+   /// Disable task to push messages to broker every second. Supposed to
+   /// be used in tests.
+   #[arg(long)]
+   disable_periodic_broker_push: bool,
}

// Like PathBufValueParser, but allows empty string.
@@ -309,6 +313,7 @@ async fn main() -> anyhow::Result<()> {
        walsenders_keep_horizon: args.walsenders_keep_horizon,
        partial_backup_enabled: args.partial_backup_enabled,
        partial_backup_timeout: args.partial_backup_timeout,
+       disable_periodic_broker_push: args.disable_periodic_broker_push,
    };

    // initialize sentry if SENTRY_DSN is provided
@@ -10,11 +10,20 @@ use anyhow::Result;
 use storage_broker::parse_proto_ttid;

 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
+use storage_broker::proto::FilterTenantTimelineId;
+use storage_broker::proto::MessageType;
+use storage_broker::proto::SafekeeperDiscoveryResponse;
+use storage_broker::proto::SubscribeByFilterRequest;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
+use storage_broker::proto::TypeSubscription;
+use storage_broker::proto::TypedMessage;
 use storage_broker::Request;

+use std::sync::atomic::AtomicU64;
+use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
+use std::time::UNIX_EPOCH;
 use tokio::task::JoinHandle;
 use tokio::time::sleep;
 use tracing::*;
@@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000;

/// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
+   if conf.disable_periodic_broker_push {
+       info!("broker push_loop is disabled, doing nothing...");
+       futures::future::pending::<()>().await; // sleep forever
+       return Ok(());
+   }
+
    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
}

/// Subscribe and fetch all the interesting data from the broker.
-async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
+async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;

    // TODO: subscribe only to local timelines instead of all
@@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);

    while let Some(msg) = stream.message().await? {
+       stats.update_pulled();
+
        let proto_ttid = msg
            .tenant_timeline_id
            .as_ref()
@@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    bail!("end of stream");
}

+/// Process incoming discover requests. This is done in a separate task to avoid
+/// interfering with the normal pull/push loops.
+async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
+    let mut client =
+        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
+
+    let request = SubscribeByFilterRequest {
+        types: vec![TypeSubscription {
+            r#type: MessageType::SafekeeperDiscoveryRequest as i32,
+        }],
+        tenant_timeline_id: Some(FilterTenantTimelineId {
+            enabled: false,
+            tenant_timeline_id: None,
+        }),
+    };
+
+    let mut stream = client
+        .subscribe_by_filter(request)
+        .await
+        .context("subscribe_by_filter request failed")?
+        .into_inner();
+
+    let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]);
+
+    while let Some(typed_msg) = stream.message().await? {
+        stats.update_pulled();
+
+        match typed_msg.r#type() {
+            MessageType::SafekeeperDiscoveryRequest => {
+                let msg = typed_msg
+                    .safekeeper_discovery_request
+                    .expect("proto type mismatch from broker message");
+
+                let proto_ttid = msg
+                    .tenant_timeline_id
+                    .as_ref()
+                    .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
+                let ttid = parse_proto_ttid(proto_ttid)?;
+                if let Ok(tli) = GlobalTimelines::get(ttid) {
+                    // we received a discovery request for a timeline we know about
+                    discover_counter.inc();
+
+                    // create and reply with discovery response
+                    let sk_info = tli.get_safekeeper_info(&conf).await;
+                    let response = SafekeeperDiscoveryResponse {
+                        safekeeper_id: sk_info.safekeeper_id,
+                        tenant_timeline_id: sk_info.tenant_timeline_id,
+                        commit_lsn: sk_info.commit_lsn,
+                        safekeeper_connstr: sk_info.safekeeper_connstr,
+                        availability_zone: sk_info.availability_zone,
+                    };
+
+                    // note this is a blocking call
+                    client
+                        .publish_one(TypedMessage {
+                            r#type: MessageType::SafekeeperDiscoveryResponse as i32,
+                            safekeeper_timeline_info: None,
+                            safekeeper_discovery_request: None,
+                            safekeeper_discovery_response: Some(response),
+                        })
+                        .await?;
+                }
+            }
+
+            _ => {
+                warn!(
+                    "unexpected message type i32 {}, {:?}",
+                    typed_msg.r#type,
+                    typed_msg.r#type()
+                );
+            }
+        }
+    }
+    bail!("end of stream");
+}
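For context, the other side of this exchange is a consumer (e.g. a pageserver) publishing a SafekeeperDiscoveryRequest and listening for responses on its own subscription. A hypothetical sketch of that requesting side, reusing the proto types above — the client type name and the request struct's fields are assumptions inferred from how this file uses them, not confirmed API:

    use storage_broker::proto::{
        MessageType, SafekeeperDiscoveryRequest, TenantTimelineId, TypedMessage,
    };

    async fn send_discovery_request(
        client: &mut storage_broker::BrokerClientChannel,
        ttid: TenantTimelineId,
    ) -> anyhow::Result<()> {
        // Publish a discovery request for one timeline; any safekeeper that knows
        // the timeline answers with a SafekeeperDiscoveryResponse via the broker.
        client
            .publish_one(TypedMessage {
                r#type: MessageType::SafekeeperDiscoveryRequest as i32,
                safekeeper_timeline_info: None,
                safekeeper_discovery_request: Some(SafekeeperDiscoveryRequest {
                    tenant_timeline_id: Some(ttid),
                }),
                safekeeper_discovery_response: None,
            })
            .await?;
        Ok(())
    }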
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    info!("started, broker endpoint {:?}", conf.broker_endpoint);

    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
+   let mut discover_handle: Option<JoinHandle<Result<(), Error>>> = None;
+
+   let stats = Arc::new(BrokerStats::new());
+   let stats_task = task_stats(stats.clone());
+   tokio::pin!(stats_task);

    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?
@@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
            };
            pull_handle = None;
        },
+       res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => {
+           // was it panic or normal error?
+           match res {
+               Ok(res_internal) => if let Err(err_inner) = res_internal {
+                   warn!("discover task failed: {:?}", err_inner);
+               }
+               Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) }
+           };
+           discover_handle = None;
+       },
        _ = ticker.tick() => {
            if push_handle.is_none() {
                push_handle = Some(tokio::spawn(push_loop(conf.clone())));
            }
            if pull_handle.is_none() {
-               pull_handle = Some(tokio::spawn(pull_loop(conf.clone())));
+               pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
            }
+           if discover_handle.is_none() {
+               discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
+           }
        },
+       _ = &mut stats_task => {}
        }
    }
}
+struct BrokerStats {
+    /// Timestamp of the last received message from the broker.
+    last_pulled_ts: AtomicU64,
+}
+
+impl BrokerStats {
+    fn new() -> Self {
+        BrokerStats {
+            last_pulled_ts: AtomicU64::new(0),
+        }
+    }
+
+    fn now_millis() -> u64 {
+        std::time::SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("time is before epoch")
+            .as_millis() as u64
+    }
+
+    /// Update last_pulled timestamp to current time.
+    fn update_pulled(&self) {
+        self.last_pulled_ts
+            .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+/// Periodically write to logs if there are issues with receiving data from the broker.
+async fn task_stats(stats: Arc<BrokerStats>) {
+    let warn_duration = Duration::from_secs(10);
+    let mut ticker = tokio::time::interval(warn_duration);
+
+    loop {
+        tokio::select! {
+            _ = ticker.tick() => {
+                let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst);
+                if last_pulled == 0 {
+                    // no broker updates yet
+                    continue;
+                }
+
+                let now = BrokerStats::now_millis();
+                if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
+                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
+                    info!("no broker updates for some time, last update: {:?}", ts);
+                }
+            }
+        }
+    }
+}
@@ -83,6 +83,7 @@ pub struct SafeKeeperConf {
    pub walsenders_keep_horizon: bool,
    pub partial_backup_enabled: bool,
    pub partial_backup_timeout: Duration,
+   pub disable_periodic_broker_push: bool,
}

impl SafeKeeperConf {
@@ -129,6 +130,7 @@ impl SafeKeeperConf {
            walsenders_keep_horizon: false,
            partial_backup_enabled: false,
            partial_backup_timeout: Duration::from_secs(0),
+           disable_periodic_broker_push: false,
        }
    }
}
@@ -178,6 +178,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        walsenders_keep_horizon: false,
        partial_backup_enabled: false,
        partial_backup_timeout: Duration::from_secs(0),
+       disable_periodic_broker_push: false,
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
@@ -196,8 +196,13 @@ impl SubscriptionKey {

    /// Parse from FilterTenantTimelineId
    pub fn from_proto_filter_tenant_timeline_id(
-       f: &FilterTenantTimelineId,
+       opt: Option<&FilterTenantTimelineId>,
    ) -> Result<Self, Status> {
+       if opt.is_none() {
+           return Ok(SubscriptionKey::All);
+       }
+
+       let f = opt.unwrap();
        if !f.enabled {
            return Ok(SubscriptionKey::All);
        }
@@ -534,10 +539,7 @@ impl BrokerService for Broker {
            .remote_addr()
            .expect("TCPConnectInfo inserted by handler");
        let proto_filter = request.into_inner();
-       let ttid_filter = proto_filter
-           .tenant_timeline_id
-           .as_ref()
-           .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;
+       let ttid_filter = proto_filter.tenant_timeline_id.as_ref();

        let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
        let types_set = proto_filter
@@ -90,7 +90,11 @@ const INITIAL_GENERATION: Generation = Generation::new(0);
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

-pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
+/// How long a node may be unresponsive to heartbeats before we declare it offline.
+/// This must be long enough to cover node restarts as well as normal operations: in future
+/// it should be separated into distinct timeouts for startup vs. normal operation
+/// (`<https://github.com/neondatabase/neon/issues/7552>`)
+pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);

pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;

@@ -4251,7 +4255,9 @@ impl Service {
    /// Check all tenants for pending reconciliation work, and reconcile those in need.
    /// Additionally, reschedule tenants that require it.
    ///
-   /// Returns how many reconciliation tasks were started
+   /// Returns how many reconciliation tasks were started, or `1` if no reconciles were
+   /// spawned but some _would_ have been spawned if `reconciler_concurrency` units were
+   /// available. A return value of 0 indicates that everything is fully reconciled already.
    fn reconcile_all(&self) -> usize {
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, _scheduler) = locked.parts_mut();
@@ -4266,7 +4272,11 @@ impl Service {
        }

        // Skip checking if this shard is already enqueued for reconciliation
-       if shard.delayed_reconcile {
+       if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
+           // If there is something delayed, then return a nonzero count so that
+           // callers like reconcile_all_now do not incorrectly get the impression
+           // that the system is in a quiescent state.
+           reconciles_spawned = std::cmp::max(1, reconciles_spawned);
            continue;
        }

@@ -4451,7 +4461,7 @@ impl Service {
            waiter_count
        );

-       Ok(waiter_count)
+       Ok(std::cmp::max(waiter_count, reconciles_spawned))
    }

    pub async fn shutdown(&self) {
@@ -952,8 +952,8 @@ impl TenantShard {

    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
    ///
-   /// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but
-   /// you would like to wait until one gets spawned in the background.
+   /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
+   /// you would like to wait on the next reconciler that gets spawned in the background.
    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
        self.ensure_sequence_ahead();
@@ -14,10 +14,18 @@ class ComputeReconfigure:
        self.server = server
        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
        self.workloads = {}
+       self.on_notify = None

    def register_workload(self, workload):
        self.workloads[workload.tenant_id] = workload

+   def register_on_notify(self, fn):
+       """
+       Add some extra work during a notification, like sleeping to slow things down, or
+       logging what was notified.
+       """
+       self.on_notify = fn
+

@pytest.fixture(scope="function")
def compute_reconfigure_listener(make_httpserver):
@@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver):
        body: dict[str, Any] = request.json
        log.info(f"notify-attach request: {body}")

+       if self.on_notify is not None:
+           self.on_notify(body)
+
        try:
            workload = self.workloads[TenantId(body["tenant_id"])]
        except KeyError:
@@ -499,6 +499,7 @@ class NeonEnvBuilder:
        self.config_init_force: Optional[str] = None
        self.top_output_dir = top_output_dir
        self.control_plane_compute_hook_api: Optional[str] = None
+       self.storage_controller_config: Optional[dict[Any, Any]] = None

        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine

@@ -512,6 +513,11 @@ class NeonEnvBuilder:
            self.pageserver_get_impl = "vectored"
            log.debug('Overriding pageserver get_impl config to "vectored"')

+       self.pageserver_validate_vectored_get: Optional[bool] = None
+       if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None:
+           self.pageserver_validate_vectored_get = bool(validate)
+           log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"')
+
        assert test_name.startswith(
            "test_"
        ), "Unexpectedly instantiated from outside a test function"
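One thing to watch in the new PAGESERVER_VALIDATE_VEC_GET handling: Python's bool() treats any non-empty string as true, so an environment value of "false" still yields True here. A stricter parse such as validate.lower() in ("1", "true") would be needed if the variable is ever set to a false-y string.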
@@ -1016,6 +1022,7 @@ class NeonEnv:
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
        self.pageserver_config_override = config.pageserver_config_override
+       self.storage_controller_config = config.storage_controller_config

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1061,6 +1068,9 @@ class NeonEnv:
        if self.control_plane_compute_hook_api is not None:
            cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api

+       if self.storage_controller_config is not None:
+           cfg["storage_controller"] = self.storage_controller_config
+
        # Create config for pageserver
        http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
        pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1085,6 +1095,8 @@ class NeonEnv:
            ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
        if config.pageserver_get_impl is not None:
            ps_cfg["get_impl"] = config.pageserver_get_impl
+       if config.pageserver_validate_vectored_get is not None:
+           ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get

        # Create a corresponding NeonPageserver object
        self.pageservers.append(
@@ -1127,12 +1139,9 @@ class NeonEnv:
        # bounce through retries on startup
        self.storage_controller.start()

-       def storage_controller_ready():
-           assert self.storage_controller.ready() is True
-
        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
-       wait_until(30, 1, storage_controller_ready)
+       self.storage_controller.wait_until_ready()

        # Start up broker, pageserver and all safekeepers
        futs = []
@@ -2036,6 +2045,15 @@ class NeonStorageController(MetricsGetter):
        else:
            raise RuntimeError(f"Unexpected status {status} from readiness endpoint")

+   def wait_until_ready(self):
+       t1 = time.time()
+
+       def storage_controller_ready():
+           assert self.ready() is True
+
+       wait_until(30, 1, storage_controller_ready)
+       return time.time() - t1
+
    def attach_hook_issue(
        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
    ) -> int:
@@ -2123,7 +2141,7 @@ class NeonStorageController(MetricsGetter):
        shard_count: Optional[int] = None,
        shard_stripe_size: Optional[int] = None,
        tenant_config: Optional[Dict[Any, Any]] = None,
-       placement_policy: Optional[str] = None,
+       placement_policy: Optional[Union[Dict[Any, Any] | str]] = None,
    ):
        """
        Use this rather than pageserver_api() when you need to include shard parameters
@@ -2233,10 +2251,21 @@ class NeonStorageController(MetricsGetter):
    def reconcile_until_idle(self, timeout_secs=30):
        start_at = time.time()
        n = 1
+       delay_sec = 0.5
+       delay_max = 5
        while n > 0:
            n = self.reconcile_all()
-           if time.time() - start_at > timeout_secs:
+           if n == 0:
+               break
+           elif time.time() - start_at > timeout_secs:
                raise RuntimeError("Timeout in reconcile_until_idle")
+           else:
+               # Don't call again right away: if we're waiting for many reconciles that
+               # are blocked on the concurrency limit, it slows things down to call
+               # reconcile_all frequently.
+               time.sleep(delay_sec)
+               delay_sec *= 2
+               delay_sec = min(delay_sec, delay_max)
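This capped exponential backoff (0.5 s, doubling up to 5 s) pairs with the reconcile_all change earlier in this set: because reconcile_all now returns a nonzero count while shards are merely delayed behind the concurrency limit, this loop keeps polling instead of declaring the system idle too early, and the backoff keeps that polling from itself slowing the controller down.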
    def consistency_check(self):
        """
@@ -3727,7 +3756,9 @@ class S3Scrubber:
        return stdout

    def scan_metadata(self) -> Any:
-       stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
+       stdout = self.scrubber_cli(
+           ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
+       )

        try:
            return json.loads(stdout)
test_runner/performance/test_storage_controller_scale.py (new file, 198 lines)
@@ -0,0 +1,198 @@
import concurrent.futures
import random
import time

import pytest
from fixtures.compute_reconfigure import ComputeReconfigure
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pg_version import PgVersion
from fixtures.types import TenantId, TenantShardId, TimelineId


@pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
def test_storage_controller_many_tenants(
    neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure
):
    """
    Check that we cope well with a not-totally-trivial number of tenants.

    This is checking for:
    - Obvious concurrency bugs from issuing many tenant creations/modifications
      concurrently.
    - Obvious scaling bugs like O(N^2) scaling that would be so slow that even
      a basic test starts failing from slowness.

    This is _not_ a comprehensive scale test: just a basic sanity check that
    we don't fall over for a few thousand shards.
    """

    neon_env_builder.num_pageservers = 5
    neon_env_builder.storage_controller_config = {
        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
        # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
        # guard against regressions in restart time.
        "max_unavailable": "300s"
    }
    neon_env_builder.control_plane_compute_hook_api = (
        compute_reconfigure_listener.control_plane_compute_hook_api
    )

    # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
    compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))

    env = neon_env_builder.init_start()

    # We will intentionally stress reconciler concurrency, which triggers a warning when lots
    # of shards are hitting the delayed path.
    env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile")

    for ps in env.pageservers:
        # This can happen because when we do a loop over all pageservers and mark them offline/active,
        # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
        # bumping generation before other attachments are detached.
        #
        # We could clean this up by making reconcilers respect the .observed of their predecessor, if
        # we spawn with a wait for the predecessor.
        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

        # Storage controller is allowed to drop pageserver requests when the cancellation token
        # for a Reconciler fires.
        ps.allowed_errors.append(".*request was dropped before completing.*")

    # Total tenants
    tenant_count = 4000

    # Shards per tenant
    shard_count = 2
    stripe_size = 1024

    tenants = set(TenantId.generate() for _i in range(0, tenant_count))

    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    def check_memory():
        # Shards should be cheap in memory, as we will have very many of them
        expect_memory_per_shard = 128 * 1024

        rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
        assert rss is not None
        log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
        assert rss < expect_memory_per_shard * shard_count * tenant_count
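For scale: check_memory bounds the controller at 128 KiB per shard; with tenant_count = 4000 and shard_count = 2 that is 8000 shards, i.e. an RSS ceiling of 1 GiB for the whole controller process.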
    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
    rng = random.Random(1234)

    # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
    # permits, to ensure that we are stressing that limit.
    api_concurrency = 135

    # We will create tenants directly via API, not via neon_local, to avoid any false
    # serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
    with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
        futs = []
        t1 = time.time()
        for tenant_id in tenants:
            f = executor.submit(
                env.storage_controller.tenant_create,
                tenant_id,
                shard_count,
                stripe_size,
                placement_policy={"Attached": 1},
            )
            futs.append(f)

        # Wait for creations to finish
        for f in futs:
            f.result()
        log.info(
            f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s"
        )

        run_ops = api_concurrency * 4
        assert run_ops < len(tenants)
        op_tenants = list(tenants)[0:run_ops]

        # Generate a mixture of operations and dispatch them all concurrently
        futs = []
        for tenant_id in op_tenants:
            op = rng.choice([0, 1, 2])
            if op == 0:
                # A fan-out write operation to all shards in a tenant (timeline creation)
                f = executor.submit(
                    virtual_ps_http.timeline_create,
                    PgVersion.NOT_SET,
                    tenant_id,
                    TimelineId.generate(),
                )
            elif op == 1:
                # A reconciler operation: migrate a shard.
                shard_number = rng.randint(0, shard_count - 1)
                tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
                dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
                f = executor.submit(
                    env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
                )
            elif op == 2:
                # A passthrough read to shard zero
                f = executor.submit(virtual_ps_http.tenant_status, tenant_id)

            futs.append(f)

        # Wait for mixed ops to finish
        for f in futs:
            f.result()

    # Consistency check is safe here: all the previous operations waited for reconcile before completing
    env.storage_controller.consistency_check()
    check_memory()

    # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time
    # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if
    # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling)
    #
    # We do not require that the system is quiescent already here, although at present in this point in the test
    # that may be the case.
    while True:
        t1 = time.time()
        reconcilers = env.storage_controller.reconcile_all()
        if reconcilers == 0:
            # Time how long a no-op background reconcile takes: this measures how long it takes to
            # loop over all the shards looking for work to do.
            runtime = time.time() - t1
            log.info(f"No-op call to reconcile_all took {runtime}s")
            assert runtime < 1
            break

    # Restart the storage controller
    env.storage_controller.stop()
    env.storage_controller.start()

    # See how long the controller takes to pass its readiness check. This should be fast because
    # all the nodes are online: offline pageservers are the only thing that's allowed to delay
    # startup.
    readiness_period = env.storage_controller.wait_until_ready()
    assert readiness_period < 5

    # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers
    # to run, as it was in a stable state before restart. If it did, that's a bug.
    env.storage_controller.consistency_check()
    check_memory()

    # Restart pageservers: this exercises the /re-attach API
    for pageserver in env.pageservers:
        pageserver.stop()
        pageserver.start()

    # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
    # as they were not offline long enough to trigger any scheduling changes.
    env.storage_controller.consistency_check()
    check_memory()

    # Stop the storage controller before tearing down fixtures, because it otherwise might log
    # errors trying to call our `ComputeReconfigure`.
    env.storage_controller.stop()
@@ -1,4 +1,6 @@
+import json
+import os
 from typing import Optional

 import pytest
 from fixtures.log_helper import log
@@ -89,3 +91,102 @@ page_cache_size=10
    # was chosen empirically for this workload.
    assert non_vectored_average < 8
    assert vectored_average < 8
+
+
+# Stripe sizes in number of pages.
+TINY_STRIPES = 16
+LARGE_STRIPES = 32768
+
+
+@pytest.mark.parametrize(
+    "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)]
+)
+def test_sharding_compaction(
+    neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int]
+):
+    """
+    Use small stripes, small layers, and small compaction thresholds to exercise how compaction
+    and image layer generation interact with sharding.
+
+    We are looking for bugs that might emerge from the way sharding uses sparse layer files that
+    only contain some of the keys in the key range covered by the layer, such as errors estimating
+    the size of layers that might result in too-small layer files.
+    """
+
+    compaction_target_size = 128 * 1024
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{compaction_target_size}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # create image layers eagerly: we want to exercise image layer creation in this test.
+        "image_creation_threshold": "1",
+        "image_layer_creation_check_threshold": 0,
+    }
+
+    neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+        initial_tenant_shard_stripe_size=stripe_size,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(64)
+    for _i in range(0, 10):
+        # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
+        # these should result in image layers each time we write some data into a shard, and also shards
+        # receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
+        # rather than asserting)
+        workload.churn_rows(64)
+
+    # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes
+    # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job.
+    shard_has_image_layers = []
+    for shard in env.storage_controller.locate(tenant_id):
+        pageserver = env.get_pageserver(shard["node_id"])
+        shard_id = shard["shard_id"]
+        layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
+        image_layer_sizes = {}
+        for layer in layer_map.historic_layers:
+            if layer.kind == "Image":
+                image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
+
+                # Pageserver should assert rather than emit an empty layer file, but double check here
+                assert layer.layer_file_size is not None
+                assert layer.layer_file_size > 0
+
+        shard_has_image_layers.append(len(image_layer_sizes) > 1)
+        log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}")
+
+        if stripe_size == TINY_STRIPES:
+            # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if
+            # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical
+            # data in a keyrange.
+            #
+            # We only do this check with tiny stripes, because large stripes may not give all shards enough
+            # data to have statistically significant image layers
+            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)  # type: ignore
+            log.info(f"Shard {shard_id} average image layer size: {avg_size}")
+            assert avg_size > compaction_target_size / 2
+
+    if stripe_size == TINY_STRIPES:
+        # Expect writes were scattered across all pageservers: they should all have compacted some image layers
+        assert all(shard_has_image_layers)
+    else:
+        # With large stripes, it is expected that most of our writes went to one pageserver, so we just require
+        # that at least one of them has some image layers.
+        assert any(shard_has_image_layers)
+
+    # Assert that everything is still readable
+    workload.validate()
@@ -228,8 +228,9 @@ def test_forward_compatibility(
    try:
        # Previous version neon_local and pageserver are not aware
        # of the new config.
-       # TODO: remove this once the code reaches main
+       # TODO: remove these once the previous version of neon local supports them
        neon_env_builder.pageserver_get_impl = None
+       neon_env_builder.pageserver_validate_vectored_get = None

        neon_env_builder.num_safekeepers = 3
        neon_local_binpath = neon_env_builder.neon_binpath
@@ -928,6 +928,8 @@ def test_sharding_split_failures(
        ".*Reconcile error: receive body: error sending request for url.*",
        # Node offline cases will fail inside reconciler when detaching secondaries
        ".*Reconcile error on shard.*: receive body: error sending request for url.*",
+       # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline
+       ".*Reconcile error.*Cancelled.*",
        # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
        ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
    ]
@@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    tenant_id = env.initial_tenant
-   timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
+   timeline_id = env.neon_cli.create_branch("test_idle_reconnections")

    def collect_stats() -> Dict[str, float]:
        # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers
@@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):

    collect_stats()

-   endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
+   endpoint = env.endpoints.create_start("test_idle_reconnections")
    # just write something to the timeline
    endpoint.safe_psql("create table t(i int)")
    collect_stats()
@@ -2007,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
    )
    log.info(f"dump_control_file response: {res}")
    assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
+
+
+# The test disables periodic pushes from the safekeepers to the broker and checks that
+# the pageserver can still discover safekeepers with discovery requests.
+def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_broker_discovery")
+
+    endpoint = env.endpoints.create_start(
+        "test_broker_discovery",
+        config_lines=["shared_buffers=1MB"],
+    )
+    endpoint.safe_psql("create table t(i int, payload text)")
+    # Install extension containing function needed to clear buffer
+    endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
+
+    def do_something():
+        time.sleep(1)
+        # generate some data to commit WAL on safekeepers
+        endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
+        # clear the buffers
+        endpoint.safe_psql("select clear_buffer_cache()")
+        # read data to fetch pages from pageserver
+        endpoint.safe_psql("select sum(i) from t")
+
+    do_something()
+    do_something()
+
+    for sk in env.safekeepers:
+        # Disable periodic broker push, so pageserver won't be able to discover
+        # safekeepers without sending a discovery request
+        sk.stop().start(extra_opts=["--disable-periodic-broker-push"])
+
+    do_something()
+    do_something()
+
+    # restart pageserver and check how everything works
+    env.pageserver.stop().start()
+
+    do_something()
+    do_something()
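This test closes the loop on the safekeeper changes above: it restarts each safekeeper with the new --disable-periodic-broker-push flag, so the pageserver can only find safekeepers through the discovery request/response path handled by discover_loop, then verifies that reads still work across a pageserver restart.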
@@ -16,7 +16,7 @@ commands:
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
-   shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml'
+   shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
    shutdownHook: |
      su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files: