Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-23 05:12:56 +00:00)

Compare commits: 4 commits, release-pr...proxy-moka
Commits: b9a4326fbd, 85033e05c9, ca578449e4, ef3a9dfafa
.github/workflows/build_and_test.yml (vendored): 21 lines changed
@@ -236,6 +236,27 @@ jobs:
          submodules: true
          fetch-depth: 1

      - name: Check Postgres submodules revision
        shell: bash -euo pipefail {0}
        run: |
          # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. updated intentionally).
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603

          FAILED=false
          for postgres in postgres-v14 postgres-v15 postgres-v16; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
              echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
              FAILED=true
            fi
          done

          if [ "${FAILED}" = "true" ]; then
            echo >&2 "Please update vendor/revisions.json if these changes are intentional"
            exit 1
          fi

      - name: Set pg 14 revision for caching
        id: pg_v14_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT

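The shell step above is described as temporary, to be replaced by a regression test (https://github.com/neondatabase/neon/pull/4603). A minimal sketch of what such a test could look like in Rust follows; the test name, the flat name-to-SHA layout of vendor/revisions.json implied by the jq expression, and the serde_json dev-dependency are assumptions for illustration, not the contents of that PR.

```rust
use std::collections::HashMap;
use std::process::Command;

// Hypothetical regression test mirroring the CI shell check: every submodule
// pinned in vendor/revisions.json must match the commit recorded in the tree.
#[test]
fn postgres_submodule_revisions_match_revisions_json() {
    let raw = std::fs::read_to_string("vendor/revisions.json")
        .expect("read vendor/revisions.json");
    let expected: HashMap<String, String> =
        serde_json::from_str(&raw).expect("parse revisions.json");

    for (submodule, expected_rev) in &expected {
        let out = Command::new("git")
            .arg("rev-parse")
            .arg(format!("HEAD:vendor/{submodule}"))
            .output()
            .expect("run git rev-parse");
        let actual = String::from_utf8_lossy(&out.stdout).trim().to_string();
        assert_eq!(
            &actual, expected_rev,
            "{submodule} is at '{actual}', but vendor/revisions.json pins '{expected_rev}'"
        );
    }
}
```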
Cargo.lock (generated): 83 lines changed
@@ -213,9 +213,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-lock"
|
||||
version = "3.2.0"
|
||||
version = "3.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
|
||||
checksum = "d034b430882f8381900d3fe6f0aaa3ad94f2cb4ac519b429692a1bc2dda4ae7b"
|
||||
dependencies = [
|
||||
"event-listener 4.0.0",
|
||||
"event-listener-strategy",
|
||||
@@ -1239,9 +1239,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "concurrent-queue"
|
||||
version = "2.3.0"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400"
|
||||
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
@@ -1348,7 +1348,6 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
@@ -1876,6 +1875,17 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d9944b8ca13534cdfb2800775f8dd4902ff3fc75a50101466decadfdf322a24"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"parking",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener-strategy"
|
||||
version = "0.4.0"
|
||||
@@ -3122,6 +3132,30 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "moka"
|
||||
version = "0.12.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e0d88686dc561d743b40de8269b26eaf0dc58781bde087b0984646602021d08"
|
||||
dependencies = [
|
||||
"async-lock",
|
||||
"async-trait",
|
||||
"crossbeam-channel",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"event-listener 5.3.0",
|
||||
"futures-util",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"quanta",
|
||||
"rustc_version",
|
||||
"smallvec",
|
||||
"tagptr",
|
||||
"thiserror",
|
||||
"triomphe",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "multimap"
|
||||
version = "0.8.3"
|
||||
@@ -4372,13 +4406,13 @@ dependencies = [
|
||||
"hyper 1.2.0",
|
||||
"hyper-tungstenite",
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
"itertools",
|
||||
"lasso",
|
||||
"md5",
|
||||
"measured",
|
||||
"metrics",
|
||||
"moka",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
@@ -4440,6 +4474,21 @@ dependencies = [
|
||||
"x509-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quanta"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e5167a477619228a0b284fac2674e3c388cba90631d7b7de620e6f1fcd08da5"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"raw-cpuid",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"web-sys",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.31.0"
|
||||
@@ -4551,6 +4600,15 @@ dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raw-cpuid"
|
||||
version = "11.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e29830cbb1290e404f24c73af91c5d8d631ce7e128691e9477556b540cd01ecd"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
@@ -5992,6 +6050,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tagptr"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.40"
|
||||
@@ -6643,6 +6707,12 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "triomphe"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3"
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.4"
|
||||
@@ -7472,6 +7542,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"fail",
|
||||
|
||||
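The Cargo.lock changes above add moka 0.12.7 (together with its quanta, tagptr, and triomphe dependencies) and list moka among the proxy crate's dependencies, matching the proxy-moka branch name. A minimal sketch of the async cache API this brings in; the key/value types, capacity, TTL, and the tokio runtime are chosen purely for illustration:

```rust
use std::time::Duration;

use moka::future::Cache;

#[tokio::main]
async fn main() {
    // Bounded async cache with a per-entry time-to-live.
    let cache: Cache<String, String> = Cache::builder()
        .max_capacity(10_000)
        .time_to_live(Duration::from_secs(60))
        .build();

    cache
        .insert("endpoint-1".to_string(), "10.0.0.1:5432".to_string())
        .await;

    // In moka 0.12 the future::Cache lookups are async as well.
    if let Some(addr) = cache.get("endpoint-1").await {
        println!("cached address: {addr}");
    }
}
```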
@@ -99,7 +99,6 @@ humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
Makefile: 11 lines changed
@@ -81,14 +81,11 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*

VERSION=$*; \
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)

# nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration.
@@ -51,7 +51,6 @@ use tracing::{error, info, warn};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
use compute_tools::compute::{
|
||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||
@@ -70,34 +69,6 @@ use compute_tools::swap::resize_swap;
|
||||
const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let (build_tag, clap_args) = init()?;
|
||||
|
||||
let (pg_handle, start_pg_result) = {
|
||||
// Enter startup tracing context
|
||||
let _startup_context_guard = startup_context_from_env();
|
||||
|
||||
let cli_args = process_cli(&clap_args)?;
|
||||
|
||||
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
|
||||
|
||||
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
|
||||
|
||||
start_postgres(&clap_args, wait_spec_result)?
|
||||
|
||||
// Startup is finished, exit the startup tracing span
|
||||
};
|
||||
|
||||
// PostgreSQL is now running, if startup was successful. Wait until it exits.
|
||||
let wait_pg_result = wait_postgres(pg_handle)?;
|
||||
|
||||
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
|
||||
|
||||
maybe_delay_exit(delay_exit);
|
||||
|
||||
deinit_and_exit(wait_pg_result);
|
||||
}
|
||||
|
||||
fn init() -> Result<(String, clap::ArgMatches)> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
@@ -112,15 +83,9 @@ fn init() -> Result<(String, clap::ArgMatches)> {
|
||||
.to_string();
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
Ok((build_tag, cli().get_matches()))
|
||||
}
|
||||
|
||||
fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
let pgbin_default = "postgres";
|
||||
let pgbin = matches
|
||||
.get_one::<String>("pgbin")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(pgbin_default);
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let ext_remote_storage = matches
|
||||
.get_one::<String>("remote-ext-config")
|
||||
@@ -148,30 +113,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
|
||||
|
||||
Ok(ProcessCliResult {
|
||||
connstr,
|
||||
pgdata,
|
||||
pgbin,
|
||||
ext_remote_storage,
|
||||
http_port,
|
||||
spec_json,
|
||||
spec_path,
|
||||
resize_swap_on_bind,
|
||||
})
|
||||
}
|
||||
|
||||
struct ProcessCliResult<'clap> {
|
||||
connstr: &'clap str,
|
||||
pgdata: &'clap str,
|
||||
pgbin: &'clap str,
|
||||
ext_remote_storage: Option<&'clap str>,
|
||||
http_port: u16,
|
||||
spec_json: Option<&'clap String>,
|
||||
spec_path: Option<&'clap String>,
|
||||
resize_swap_on_bind: bool,
|
||||
}
|
||||
|
||||
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -208,7 +149,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
if let Ok(val) = std::env::var("TRACESTATE") {
|
||||
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
||||
}
|
||||
if !startup_tracing_carrier.is_empty() {
|
||||
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
||||
let guard = TraceContextPropagator::new()
|
||||
@@ -218,17 +159,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
Some(guard)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
fn try_spec_from_cli(
|
||||
matches: &clap::ArgMatches,
|
||||
ProcessCliResult {
|
||||
spec_json,
|
||||
spec_path,
|
||||
..
|
||||
}: &ProcessCliResult,
|
||||
) -> Result<CliSpecParams> {
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
@@ -269,34 +201,6 @@ fn try_spec_from_cli(
|
||||
}
|
||||
};
|
||||
|
||||
Ok(CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
})
|
||||
}
|
||||
|
||||
struct CliSpecParams {
|
||||
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
|
||||
spec: Option<ComputeSpec>,
|
||||
live_config_allowed: bool,
|
||||
}
|
||||
|
||||
fn wait_spec(
|
||||
build_tag: String,
|
||||
ProcessCliResult {
|
||||
connstr,
|
||||
pgdata,
|
||||
pgbin,
|
||||
ext_remote_storage,
|
||||
resize_swap_on_bind,
|
||||
http_port,
|
||||
..
|
||||
}: ProcessCliResult,
|
||||
CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
}: CliSpecParams,
|
||||
) -> Result<WaitSpecResult> {
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
@@ -335,6 +239,8 @@ fn wait_spec(
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
@@ -363,29 +269,6 @@ fn wait_spec(
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
})
|
||||
}
|
||||
|
||||
struct WaitSpecResult {
|
||||
compute: Arc<ComputeNode>,
|
||||
// passed through from ProcessCliResult
|
||||
http_port: u16,
|
||||
resize_swap_on_bind: bool,
|
||||
}
|
||||
|
||||
fn start_postgres(
|
||||
// need to allow unused because `matches` is only used if target_os = "linux"
|
||||
#[allow(unused_variables)] matches: &clap::ArgMatches,
|
||||
WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
}: WaitSpecResult,
|
||||
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||
// We got all we need, update the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.status = ComputeStatus::Init;
|
||||
@@ -435,10 +318,10 @@ fn start_postgres(
|
||||
}
|
||||
}
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
// Start Postgres
|
||||
let mut pg = None;
|
||||
let mut exit_code = None;
|
||||
|
||||
if !prestartup_failed {
|
||||
pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
@@ -493,7 +376,7 @@ fn start_postgres(
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
let token = CancellationToken::new();
|
||||
|
||||
let vm_monitor = rt.as_ref().map(|rt| {
|
||||
let vm_monitor = &rt.as_ref().map(|rt| {
|
||||
rt.spawn(vm_monitor::start(
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: cgroup.cloned(),
|
||||
@@ -506,41 +389,12 @@ fn start_postgres(
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
pg,
|
||||
StartPostgresResult {
|
||||
delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
rt,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
|
||||
|
||||
struct StartPostgresResult {
|
||||
delay_exit: bool,
|
||||
// passed through from WaitSpecResult
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
rt: Option<tokio::runtime::Runtime>,
|
||||
#[cfg(target_os = "linux")]
|
||||
token: tokio_util::sync::CancellationToken,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
||||
// propagate to Postgres and it will be shut down as well.
|
||||
let mut exit_code = None;
|
||||
if let Some((mut pg, logs_handle)) = pg {
|
||||
// Startup is finished, exit the startup tracing span
|
||||
drop(startup_context_guard);
|
||||
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
@@ -555,25 +409,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
|
||||
Ok(WaitPostgresResult { exit_code })
|
||||
}
|
||||
|
||||
struct WaitPostgresResult {
|
||||
exit_code: Option<i32>,
|
||||
}
|
||||
|
||||
fn cleanup_after_postgres_exit(
|
||||
StartPostgresResult {
|
||||
mut delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
rt,
|
||||
}: StartPostgresResult,
|
||||
) -> Result<bool> {
|
||||
// Terminate the vm_monitor so it releases the file watcher on
|
||||
// /sys/fs/cgroup/neon-postgres.
|
||||
// Note: the vm-monitor only runs on linux because it requires cgroups.
|
||||
@@ -615,19 +450,13 @@ fn cleanup_after_postgres_exit(
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
|
||||
Ok(delay_exit)
|
||||
}
|
||||
|
||||
fn maybe_delay_exit(delay_exit: bool) {
|
||||
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
||||
// control plane can get the actual error.
|
||||
if delay_exit {
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
}
|
||||
}
|
||||
|
||||
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit. Shutting down OTEL tracing provider may
|
||||
// hang for quite some time, see, for example:
|
||||
|
||||
@@ -28,7 +28,6 @@ serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
@@ -133,7 +133,7 @@ fn main() -> Result<()> {
|
||||
let subcommand_result = match sub_name {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(&env)),
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||
@@ -358,13 +358,6 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
default_conf(*num_pageservers)
|
||||
};
|
||||
|
||||
let pageserver_config: toml_edit::Document =
|
||||
if let Some(path) = init_match.get_one::<PathBuf>("pageserver-config") {
|
||||
std::fs::read_to_string(path)?.parse()?
|
||||
} else {
|
||||
toml_edit::Document::new()
|
||||
};
|
||||
|
||||
let pg_version = init_match
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
@@ -382,7 +375,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
// Initialize pageserver, create initial tenant and timeline.
|
||||
for ps_conf in &env.pageservers {
|
||||
PageServerNode::from_env(&env, ps_conf)
|
||||
.initialize(pageserver_config.clone())
|
||||
.initialize(&pageserver_config_overrides(init_match))
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("pageserver init failed: {e:?}");
|
||||
exit(1);
|
||||
@@ -404,6 +397,15 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
|
||||
PageServerNode::from_env(env, ps_conf)
|
||||
}
|
||||
|
||||
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    init_match
        .get_many::<String>("pageserver-config-override")
        .into_iter()
        .flatten()
        .map(String::as_str)
        .collect()
}

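pageserver_config_overrides simply collects every repeated --pageserver-config-override occurrence into string slices, which initialize()/start() then forward to the pageserver binary as --config-override arguments. A small self-contained sketch of that clap wiring; the command name and the override values are illustrative only, while the Arg definition mirrors the pageserver_config_args shown further down in this diff:

```rust
use clap::{Arg, ArgAction, ArgMatches, Command};

fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    init_match
        .get_many::<String>("pageserver-config-override")
        .into_iter()
        .flatten()
        .map(String::as_str)
        .collect()
}

fn main() {
    let matches = Command::new("neon_local")
        .arg(
            Arg::new("pageserver-config-override")
                .long("pageserver-config-override")
                .num_args(1)
                .action(ArgAction::Append),
        )
        .get_matches_from([
            "neon_local",
            "--pageserver-config-override", "no_sync=true",
            "--pageserver-config-override", "remote_storage={local_path='.neon/remote'}",
        ]);

    // Each occurrence becomes one `--config-override` argument for the pageserver binary.
    assert_eq!(pageserver_config_overrides(&matches).len(), 2);
}
```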
async fn handle_tenant(
|
||||
tenant_match: &ArgMatches,
|
||||
env: &mut local_env::LocalEnv,
|
||||
@@ -1074,7 +1076,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1100,7 +1105,10 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver.start().await {
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1227,7 +1235,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
// Endpoints are not started automatically
|
||||
|
||||
broker::start_broker_process(env).await?;
|
||||
@@ -1244,7 +1252,10 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver.start().await {
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1385,6 +1396,13 @@ fn cli() -> Command {
|
||||
.required(false)
|
||||
.value_name("stop-mode");
|
||||
|
||||
let pageserver_config_args = Arg::new("pageserver-config-override")
|
||||
.long("pageserver-config-override")
|
||||
.num_args(1)
|
||||
.action(ArgAction::Append)
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||
.long("remote-ext-config")
|
||||
.num_args(1)
|
||||
@@ -1446,21 +1464,14 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("init")
|
||||
.about("Initialize a new Neon repository, preparing configs for services to start with")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(num_pageservers_arg.clone())
|
||||
.arg(
|
||||
Arg::new("config")
|
||||
.long("config")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.value_name("config")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pageserver-config")
|
||||
.long("pageserver-config")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.value_name("pageserver-config")
|
||||
.help("Merge the provided pageserver config into the one generated by neon_local."),
|
||||
.value_name("config"),
|
||||
)
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(force_arg)
|
||||
@@ -1542,6 +1553,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
@@ -1549,6 +1561,7 @@ fn cli() -> Command {
|
||||
)
|
||||
.subcommand(Command::new("restart")
|
||||
.about("Restart local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
@@ -1663,6 +1676,7 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("start")
|
||||
.about("Start page server and safekeepers")
|
||||
.arg(pageserver_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -562,10 +562,6 @@ impl LocalEnv {
|
||||
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
|
||||
}
|
||||
|
||||
for ps in &self.pageservers {
|
||||
fs::create_dir(self.pageserver_data_dir(ps.id))?;
|
||||
}
|
||||
|
||||
self.persist_config(base_path)
|
||||
}
|
||||
|
||||
|
||||
@@ -10,15 +10,14 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
|
||||
TimelineInfo,
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -74,12 +73,10 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
fn pageserver_init_make_toml(
|
||||
&self,
|
||||
cli_overrides: toml_edit::Document,
|
||||
) -> anyhow::Result<toml_edit::Document> {
|
||||
// TODO: this is a legacy code, it should be refactored to use toml_edit directly.
|
||||
|
||||
/// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
|
||||
///
|
||||
/// These all end up on the command line of the `pageserver` binary.
|
||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||
let pg_distrib_dir_param = format!(
|
||||
"pg_distrib_dir='{}'",
|
||||
@@ -159,7 +156,10 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
if !cli_overrides.contains_key("remote_storage") {
|
||||
if !cli_overrides
|
||||
.iter()
|
||||
.any(|c| c.starts_with("remote_storage"))
|
||||
{
|
||||
overrides.push(format!(
|
||||
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
|
||||
));
|
||||
@@ -172,23 +172,14 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
// Apply the user-provided overrides
|
||||
overrides.push(cli_overrides.to_string());
|
||||
overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
|
||||
|
||||
// Turn `overrides` into a toml document.
|
||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||
let mut config_toml = toml_edit::Document::new();
|
||||
for fragment_str in overrides {
|
||||
let fragment = toml_edit::Document::from_str(&fragment_str)
|
||||
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
||||
for (key, item) in fragment.iter() {
|
||||
config_toml.insert(key, item.clone());
|
||||
}
|
||||
}
|
||||
Ok(config_toml)
|
||||
overrides
|
||||
}
|
||||
|
||||
/// Initializes a pageserver node by creating its config with the overrides provided.
|
||||
pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> {
|
||||
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
|
||||
self.pageserver_init(config_overrides)
|
||||
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
|
||||
}
|
||||
@@ -205,11 +196,11 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
self.start_node().await
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides).await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> {
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
let datadir = self.repo_path();
|
||||
let node_id = self.conf.id;
|
||||
println!(
|
||||
@@ -220,20 +211,36 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
|
||||
let config = self
|
||||
.pageserver_init_make_toml(cli_overrides)
|
||||
.context("make pageserver toml")?;
|
||||
let config_file_path = datadir.join("pageserver.toml");
|
||||
let mut config_file = std::fs::OpenOptions::new()
|
||||
.create_new(true)
|
||||
.write(true)
|
||||
.open(&config_file_path)
|
||||
.with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
|
||||
config_file
|
||||
.write_all(config.to_string().as_bytes())
|
||||
.context("write pageserver toml")?;
|
||||
drop(config_file);
|
||||
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||
if !datadir.exists() {
|
||||
std::fs::create_dir(&datadir)?;
|
||||
}
|
||||
|
||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
||||
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
|
||||
})?;
|
||||
|
||||
// `pageserver --init` merges the `--config-override`s into a built-in default config,
|
||||
// then writes out the merged product to `pageserver.toml`.
|
||||
// TODO: just write the full `pageserver.toml` and get rid of `--config-override`.
|
||||
let mut args = vec!["--init", "--workdir", datadir_path_str];
|
||||
let overrides = self.neon_local_overrides(config_overrides);
|
||||
for piece in &overrides {
|
||||
args.push("--config-override");
|
||||
args.push(piece);
|
||||
}
|
||||
let init_output = Command::new(self.env.pageserver_bin())
|
||||
.args(args)
|
||||
.envs(self.pageserver_env_variables()?)
|
||||
.output()
|
||||
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
|
||||
|
||||
anyhow::ensure!(
|
||||
init_output.status.success(),
|
||||
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
|
||||
node_id,
|
||||
String::from_utf8_lossy(&init_output.stdout),
|
||||
String::from_utf8_lossy(&init_output.stderr),
|
||||
);
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
// the storage controller
|
||||
@@ -261,7 +268,7 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_node(&self) -> anyhow::Result<()> {
|
||||
async fn start_node(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -278,7 +285,11 @@ impl PageServerNode {
|
||||
self.conf.id, datadir,
|
||||
)
|
||||
})?;
|
||||
let args = vec!["-D", datadir_path_str];
|
||||
let mut args = vec!["-D", datadir_path_str];
|
||||
for config_override in config_overrides {
|
||||
args.push("--config-override");
|
||||
args.push(*config_override);
|
||||
}
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
&datadir,
|
||||
@@ -425,11 +436,11 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
switch_to_aux_file_v2: settings
|
||||
.remove("switch_to_aux_file_v2")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -548,11 +559,11 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
switch_to_aux_file_v2: settings
|
||||
.remove("switch_to_aux_file_v2")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -480,15 +480,6 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.remove_metric(id)
|
||||
}
|
||||
|
||||
pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
|
||||
let id = self.vec.with_labels(labels);
|
||||
let metric = self.vec.get_metric(id);
|
||||
|
||||
let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
|
||||
let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
|
||||
inc.saturating_sub(dec)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
|
||||
|
||||
@@ -240,7 +240,7 @@ impl<'a> ShardedRange<'a> {
|
||||
/// pages that would not actually be stored on this node.
|
||||
///
|
||||
/// Don't use this function in code that works with physical entities like layer files.
|
||||
pub fn raw_size(range: &Range<Key>) -> u32 {
|
||||
fn raw_size(range: &Range<Key>) -> u32 {
|
||||
if is_contiguous_range(range) {
|
||||
contiguous_range_len(range)
|
||||
} else {
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
pub mod detach_ancestor;
|
||||
pub mod partitioning;
|
||||
pub mod utilization;
|
||||
|
||||
@@ -9,7 +8,6 @@ use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -305,31 +303,7 @@ pub struct TenantConfig {
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AuxFilePolicy {
|
||||
V1,
|
||||
V2,
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl FromStr for AuxFilePolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let s = s.to_lowercase();
|
||||
if s == "v1" {
|
||||
Ok(Self::V1)
|
||||
} else if s == "v2" {
|
||||
Ok(Self::V2)
|
||||
} else if s == "crossvalidation" || s == "cross_validation" {
|
||||
Ok(Self::CrossValidation)
|
||||
} else {
|
||||
anyhow::bail!("cannot parse {} to aux file policy", s)
|
||||
}
|
||||
}
|
||||
pub switch_to_aux_file_v2: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[derive(Default, serde::Serialize)]
|
||||
pub struct AncestorDetached {
|
||||
pub reparented_timelines: Vec<TimelineId>,
|
||||
}
|
||||
@@ -1,4 +1,3 @@
|
||||
use bytes::{Buf, BufMut, Bytes};
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
|
||||
use tracing::warn;
|
||||
|
||||
@@ -62,84 +61,6 @@ pub fn encode_aux_file_key(path: &str) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
const AUX_FILE_ENCODING_VERSION: u8 = 0x01;
|
||||
|
||||
pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
|
||||
let mut ptr = val;
|
||||
if ptr.is_empty() {
|
||||
// empty value = no files
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
assert_eq!(
|
||||
ptr.get_u8(),
|
||||
AUX_FILE_ENCODING_VERSION,
|
||||
"unsupported aux file value"
|
||||
);
|
||||
let mut files = vec![];
|
||||
while ptr.has_remaining() {
|
||||
let key_len = ptr.get_u32() as usize;
|
||||
let key = &ptr[..key_len];
|
||||
ptr.advance(key_len);
|
||||
let val_len = ptr.get_u32() as usize;
|
||||
let content = &ptr[..val_len];
|
||||
ptr.advance(val_len);
|
||||
|
||||
let path = std::str::from_utf8(key)?;
|
||||
files.push((path, content));
|
||||
}
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference
|
||||
/// to the original value slice. Be cautious about memory consumption.
|
||||
pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
|
||||
let mut ptr = val.clone();
|
||||
if ptr.is_empty() {
|
||||
// empty value = no files
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
assert_eq!(
|
||||
ptr.get_u8(),
|
||||
AUX_FILE_ENCODING_VERSION,
|
||||
"unsupported aux file value"
|
||||
);
|
||||
let mut files = vec![];
|
||||
while ptr.has_remaining() {
|
||||
let key_len = ptr.get_u32() as usize;
|
||||
let key = ptr.slice(..key_len);
|
||||
ptr.advance(key_len);
|
||||
let val_len = ptr.get_u32() as usize;
|
||||
let content = ptr.slice(..val_len);
|
||||
ptr.advance(val_len);
|
||||
|
||||
let path = std::str::from_utf8(&key)?.to_string();
|
||||
files.push((path, content));
|
||||
}
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
    if files.is_empty() {
        // no files = empty value
        return Ok(Vec::new());
    }
    let mut encoded = vec![];
    encoded.put_u8(AUX_FILE_ENCODING_VERSION);
    for (path, content) in files {
        if path.len() > u32::MAX as usize {
            anyhow::bail!("{} exceeds path size limit", path);
        }
        encoded.put_u32(path.len() as u32);
        encoded.put_slice(path.as_bytes());
        if content.len() > u32::MAX as usize {
            anyhow::bail!("{} exceeds content size limit", path);
        }
        encoded.put_u32(content.len() as u32);
        encoded.put_slice(content);
    }
    Ok(encoded)
}

#[cfg(test)]
mod tests {
    use super::*;
@@ -188,21 +109,4 @@ mod tests {
            encode_aux_file_key("other_file_not_supported").to_string()
        );
    }

    #[test]
    fn test_value_encoding() {
        let files = vec![
            ("pg_logical/1.file", "1111".as_bytes()),
            ("pg_logical/2.file", "2222".as_bytes()),
        ];
        assert_eq!(
            files,
            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
        );
        let files = vec![];
        assert_eq!(
            files,
            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
        );
    }
}

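For reference, the value format used by encode_file_value/decode_file_value above is: one version byte (0x01), then for each file a big-endian u32 path length, the path bytes, a big-endian u32 content length, and the content bytes; an empty value means "no files". A tiny standalone illustration of that layout; the byte literals below are derived from the definition above, not copied from the repository:

```rust
fn main() {
    // encode_file_value(&[("a", b"xy")]) would produce exactly this layout:
    let encoded: Vec<u8> = vec![
        0x01, // AUX_FILE_ENCODING_VERSION
        0x00, 0x00, 0x00, 0x01, // path length = 1 (big-endian u32, as written by put_u32)
        b'a', // path bytes
        0x00, 0x00, 0x00, 0x02, // content length = 2
        b'x', b'y', // content bytes
    ];
    assert_eq!(encoded.len(), 1 + 4 + 1 + 4 + 2);
}
```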
@@ -540,12 +540,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.secondary_tenant
|
||||
.evict_layer(
|
||||
tenant_manager.get_conf(),
|
||||
layer.timeline_id,
|
||||
layer.name,
|
||||
layer.metadata,
|
||||
)
|
||||
.evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
|
||||
.await;
|
||||
Ok(file_size)
|
||||
});
|
||||
|
||||
@@ -63,7 +63,6 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::SpawnMode;
|
||||
@@ -1229,15 +1228,13 @@ async fn layer_download_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let layer_name = LayerFileName::from_str(layer_file_name)
|
||||
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let downloaded = timeline
|
||||
.download_layer(&layer_name)
|
||||
.download_layer(layer_file_name)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1261,14 +1258,11 @@ async fn evict_timeline_layer_handler(
|
||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let layer_name = LayerFileName::from_str(layer_file_name)
|
||||
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let evicted = timeline
|
||||
.evict_layer(&layer_name)
|
||||
.evict_layer(layer_file_name)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1833,75 +1827,6 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
async fn timeline_detach_ancestor_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::timeline::detach_ancestor::Options;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
|
||||
|
||||
async move {
|
||||
let mut options = Options::default();
|
||||
|
||||
let rewrite_concurrency =
|
||||
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
|
||||
let copy_concurrency =
|
||||
parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?;
|
||||
|
||||
[
|
||||
(&mut options.rewrite_concurrency, rewrite_concurrency),
|
||||
(&mut options.copy_concurrency, copy_concurrency),
|
||||
]
|
||||
.into_iter()
|
||||
.filter_map(|(target, val)| val.map(|val| (target, val)))
|
||||
.for_each(|(target, val)| *target = val);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(reparented_timelines) => {
|
||||
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
|
||||
reparented_timelines,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
Err(e) => Err(ApiError::InternalServerError(
|
||||
e.context("timeline detach completion"),
|
||||
)),
|
||||
}
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn deletion_queue_flush(
|
||||
r: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
@@ -2590,10 +2515,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor",
            |r| api_handler(r, timeline_detach_ancestor_handler),
        )
        .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_delete_handler)
        })

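The route added above exposes timeline ancestor detach as PUT /v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor, with the optional rewrite_concurrency and copy_concurrency query parameters read by timeline_detach_ancestor_handler. A hedged sketch of calling it from a Rust client; the base URL, IDs, concurrency values, and the reqwest/anyhow dependencies are assumptions for illustration, not part of this change:

```rust
use anyhow::Result;

// Placeholders throughout: point this at a real pageserver HTTP address and real IDs.
async fn detach_ancestor(base: &str, tenant_shard_id: &str, timeline_id: &str) -> Result<()> {
    let url =
        format!("{base}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor");
    let resp = reqwest::Client::new()
        .put(url)
        // Both parameters are optional; the handler falls back to Options::default().
        .query(&[("rewrite_concurrency", "4"), ("copy_concurrency", "8")])
        .send()
        .await?;
    let status = resp.status();
    // On success the handler responds with AncestorDetached { reparented_timelines } as JSON.
    let body = resp.text().await?;
    println!("detach_ancestor -> {status}: {body}");
    Ok(())
}
```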
@@ -2929,8 +2929,6 @@ pub fn preinitialize_metrics() {
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
|
||||
&REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
|
||||
@@ -10,9 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::repository::*;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
@@ -24,7 +24,6 @@ use pageserver_api::key::{
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -671,7 +670,7 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
async fn list_aux_files_v1(
|
||||
pub(crate) async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -689,63 +688,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_aux_files_v2(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
||||
.await
|
||||
.context("scan")?;
|
||||
let mut result = HashMap::new();
|
||||
for (_, v) in kv {
|
||||
let v = v.context("get value")?;
|
||||
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
|
||||
for (fname, content) in v {
|
||||
result.insert(fname, content);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get_switch_aux_file_policy() {
|
||||
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
|
||||
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
|
||||
AuxFilePolicy::CrossValidation => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
|
||||
match (v1_result, v2_result) {
|
||||
(Ok(v1), Ok(v2)) => {
|
||||
if v1 != v2 {
|
||||
tracing::error!(
|
||||
"unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
|
||||
);
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"unmatched aux file v1 v2 result"
|
||||
)));
|
||||
}
|
||||
Ok(v1)
|
||||
}
|
||||
(Ok(_), Err(v2)) => {
|
||||
tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
|
||||
Err(v2)
|
||||
}
|
||||
(Err(v1), Ok(_)) => {
|
||||
tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
|
||||
Err(v1)
|
||||
}
|
||||
(Err(_), Err(v2)) => Err(v2),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used to initialize the logical size tracking on startup.
|
||||
///
|
||||
@@ -1447,9 +1389,6 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
|
||||
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
|
||||
return Ok(());
|
||||
}
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
@@ -1465,122 +1404,90 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let policy = self.tline.get_switch_aux_file_policy();
|
||||
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
Ok(val) => Some(val),
|
||||
Err(PageReconstructError::MissingKey(_)) => None,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
let new_files = if content.is_empty() {
|
||||
files
|
||||
.into_iter()
|
||||
.filter(|(p, _)| &path != p)
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
files
|
||||
.into_iter()
|
||||
.filter(|(p, _)| &path != p)
|
||||
.chain(std::iter::once((path, content)))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
}
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value.
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
aux_files.n_deltas = 0;
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
|
||||
);
|
||||
aux_files.n_deltas += 1;
|
||||
}
|
||||
aux_files.dir = Some(dir);
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(dir_bytes) => {
|
||||
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
|
||||
// Key is already set, we may append a delta
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
n_files = dir.files.len();
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
|
||||
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
||||
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
||||
Err(
|
||||
PageReconstructError::Other(_)
|
||||
| PageReconstructError::WalRedo(_)
|
||||
| PageReconstructError::MissingKey { .. },
|
||||
) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value.
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
dir.upsert(file_path, content);
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
aux_files.n_deltas = 0;
|
||||
} else {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
|
||||
);
|
||||
aux_files.n_deltas += 1;
|
||||
}
|
||||
aux_files.dir = Some(dir);
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(dir_bytes) => {
|
||||
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
|
||||
// Key is already set, we may append a delta
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
n_files = dir.files.len();
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
|
||||
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
||||
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
||||
Err(
|
||||
PageReconstructError::Other(_)
|
||||
| PageReconstructError::WalRedo(_)
|
||||
| PageReconstructError::MissingKey { .. },
|
||||
) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
dir.upsert(file_path, content);
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
n_files = 1;
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
n_files = 1;
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
}
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, n_files));
|
||||
}
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, n_files));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) enum InvalidInput {
|
||||
TooShortValue,
|
||||
@@ -41,8 +42,10 @@ pub(crate) enum InvalidInput {
|
||||
|
||||
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
|
||||
/// use this type for querying if a slice looks some particular way.
|
||||
#[cfg(test)]
|
||||
pub(crate) struct ValueBytes;
|
||||
|
||||
#[cfg(test)]
|
||||
impl ValueBytes {
|
||||
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
if raw.len() < 12 {
|
||||
|
||||
@@ -370,8 +370,6 @@ pub enum TaskKind {
|
||||
|
||||
#[cfg(test)]
|
||||
UnitTest,
|
||||
|
||||
DetachAncestor,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -322,9 +322,6 @@ pub struct Tenant {
|
||||
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
|
||||
pub(crate) timeline_get_throttle:
|
||||
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
|
||||
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
|
||||
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Tenant {
|
||||
@@ -2560,7 +2557,6 @@ impl Tenant {
|
||||
&crate::metrics::tenant_throttling::TIMELINE_GET,
|
||||
)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3758,7 +3754,7 @@ pub(crate) mod harness {
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
|
||||
switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
@@ -371,9 +370,9 @@ pub struct TenantConf {
// Expressed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,

/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
/// Switch to aux file v2. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
pub switch_aux_file_policy: AuxFilePolicy,
pub switch_to_aux_file_v2: bool,
}

/// Same as TenantConf, but this struct preserves the information about
@@ -472,7 +471,7 @@ pub struct TenantConfOpt {

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,
pub switch_to_aux_file_v2: Option<bool>,
}

impl TenantConfOpt {
@@ -530,9 +529,9 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
switch_to_aux_file_v2: self
.switch_to_aux_file_v2
.unwrap_or(global_conf.switch_to_aux_file_v2),
}
}
}
@@ -574,7 +573,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::V1,
switch_to_aux_file_v2: false,
}
}
}
@@ -649,7 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
switch_to_aux_file_v2: value.switch_to_aux_file_v2,
}
}
}
@@ -207,24 +207,6 @@ impl TimelineMetadata {
self.body.ancestor_lsn
}

/// When reparenting, the `ancestor_lsn` does not change.
pub fn reparent(&mut self, timeline: &TimelineId) {
assert!(self.body.ancestor_timeline.is_some());
// no assertion for redoing this: it's fine, we may have to repeat this multiple times over
self.body.ancestor_timeline = Some(*timeline);
}

pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, *timeline);
}
if self.body.ancestor_lsn != Lsn(0) {
assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
}
self.body.ancestor_timeline = None;
self.body.ancestor_lsn = Lsn(0);
}

pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
self.body.latest_gc_cutoff_lsn
}
@@ -56,7 +56,6 @@ use utils::id::{TenantId, TimelineId};

use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;

/// For a tenant that appears in TenantsMap, it may either be
@@ -2008,101 +2007,6 @@ impl TenantManager {
})
.collect())
}

/// Completes an earlier prepared timeline detach ancestor.
pub(crate) async fn complete_detaching_timeline_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
struct RevertOnDropSlot(Option<SlotGuard>);

impl Drop for RevertOnDropSlot {
fn drop(&mut self) {
if let Some(taken) = self.0.take() {
taken.revert();
}
}
}

impl RevertOnDropSlot {
fn into_inner(mut self) -> SlotGuard {
self.0.take().unwrap()
}
}

impl std::ops::Deref for RevertOnDropSlot {
type Target = SlotGuard;

fn deref(&self) -> &Self::Target {
self.0.as_ref().unwrap()
}
}

let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
let slot_guard = RevertOnDropSlot(Some(slot_guard));

let tenant = {
let Some(old_slot) = slot_guard.get_old_value() else {
anyhow::bail!(
"Tenant not found when trying to complete detaching timeline ancestor"
);
};

let Some(tenant) = old_slot.get_attached() else {
anyhow::bail!("Tenant is not in attached state");
};

if !tenant.is_active() {
anyhow::bail!("Tenant is not active");
}

tenant.clone()
};

let timeline = tenant.get_timeline(timeline_id, true)?;

let reparented = timeline
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
.await?;

let mut slot_guard = slot_guard.into_inner();

let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
}

let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;

let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
ctx,
)?;

slot_guard.upsert(TenantSlot::Attached(tenant))?;

Ok(reparented)
}
}

#[derive(Debug, thiserror::Error)]
@@ -570,7 +570,7 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();

self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

Ok(())
}
@@ -591,7 +591,7 @@ impl RemoteTimelineClient {

upload_queue.latest_metadata.apply(update);

self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

Ok(())
}
@@ -611,14 +611,18 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;

if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
}

Ok(())
}

/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

info!(
@@ -627,7 +631,11 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);

let index_part = IndexPart::from(&*upload_queue);
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -637,61 +645,9 @@ impl RemoteTimelineClient {
self.launch_queued_tasks(upload_queue);
}

pub(crate) async fn schedule_reparenting_and_wait(
self: &Arc<Self>,
new_parent: &TimelineId,
) -> anyhow::Result<()> {
// FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
// and reads the in-memory part we cannot do the detaching like this
let receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;

upload_queue.latest_metadata.reparent(new_parent);

self.schedule_index_upload(upload_queue);

self.schedule_barrier0(upload_queue)
};

Self::wait_completion0(receiver).await
}

/// Schedules uploading a new version of `index_part.json` with the given layers added,
/// detaching from ancestor and waits for it to complete.
///
/// This is used with `Timeline::detach_ancestor` functionality.
pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait(
self: &Arc<Self>,
layers: &[Layer],
adopted: (TimelineId, Lsn),
) -> anyhow::Result<()> {
let barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;

upload_queue
.latest_metadata
.detach_from_ancestor(&adopted.0, &adopted.1);

for layer in layers {
upload_queue
.latest_files
.insert(layer.layer_desc().filename(), layer.metadata());
}

self.schedule_index_upload(upload_queue);

let barrier = self.schedule_barrier0(upload_queue);
self.launch_queued_tasks(upload_queue);
barrier
};

Self::wait_completion0(barrier).await
}

/// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload.
/// Launch an upload operation in the background.
///
pub(crate) fn schedule_layer_file_upload(
self: &Arc<Self>,
layer: ResidentLayer,
@@ -717,11 +673,9 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

info!(
gen=?metadata.generation,
shard=?metadata.shard,
"scheduled layer file upload {layer}",
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);

let op = UploadOp::UploadLayer(layer, metadata);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -784,6 +738,10 @@ impl RemoteTimelineClient {
where
I: IntoIterator<Item = LayerFileName>,
{
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need to update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();

// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
// is later used when physically deleting layers, to construct key paths.
@@ -822,7 +780,7 @@ impl RemoteTimelineClient {
// index_part update, because that needs to be uploaded before we can actually delete the
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, metadata);
}

with_metadata
@@ -924,18 +882,12 @@ impl RemoteTimelineClient {

/// Wait for all previously scheduled uploads/deletions to complete
pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
let receiver = {
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue)
};

Self::wait_completion0(receiver).await
}

async fn wait_completion0(
mut receiver: tokio::sync::watch::Receiver<()>,
) -> anyhow::Result<()> {
if receiver.changed().await.is_err() {
anyhow::bail!("wait_completion aborted because upload queue was stopped");
}
@@ -1051,7 +1003,8 @@ impl RemoteTimelineClient {
let deleted_at = Utc::now().naive_utc();
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
.context("IndexPart serialize")?;
index_part.deleted_at = Some(deleted_at);
index_part
};
@@ -1132,93 +1085,6 @@ impl RemoteTimelineClient {
Ok(())
}

/// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload.
///
/// This is not normally needed.
pub(crate) async fn upload_layer_file(
self: &Arc<Self>,
uploaded: &ResidentLayer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&uploaded.layer_desc().filename(),
uploaded.metadata().generation,
);

backoff::retry(
|| async {
upload::upload_timeline_layer(
&self.storage_impl,
uploaded.local_path(),
&remote_path,
uploaded.metadata().file_size(),
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"upload a layer without adding it to latest files",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("upload a layer without adding it to latest files")
}

/// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is
/// not added to be part of a future `index_part.json` upload.
pub(crate) async fn copy_timeline_layer(
self: &Arc<Self>,
adopted: &Layer,
adopted_as: &Layer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let source_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&adopted
.get_timeline_id()
.expect("Source timeline should be alive"),
self.tenant_shard_id.to_index(),
&adopted.layer_desc().filename(),
adopted.metadata().generation,
);

let target_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&adopted_as.layer_desc().filename(),
adopted_as.metadata().generation,
);

backoff::retry(
|| async {
upload::copy_timeline_layer(
&self.storage_impl,
&source_remote_path,
&target_remote_path,
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"copy timeline layer",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remote copy timeline layer")
}

async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
@@ -1390,7 +1256,7 @@ impl RemoteTimelineClient {
while let Some(next_op) = upload_queue.queued_operations.front() {
// Can we run this task now?
let can_run_now = match next_op {
UploadOp::UploadLayer(..) => {
UploadOp::UploadLayer(_, _) => {
// Can always be scheduled.
true
}
@@ -1517,25 +1383,13 @@ impl RemoteTimelineClient {

let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let local_path = layer.local_path();

// We should only be uploading layers created by this `Tenant`'s lifetime, so
// the metadata in the upload should always match our current generation.
assert_eq!(layer_metadata.generation, self.generation);

let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
layer_metadata.shard,
&layer.layer_desc().filename(),
layer_metadata.generation,
);

let path = layer.local_path();
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
local_path,
&remote_path,
layer_metadata.file_size(),
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
@@ -1964,6 +1818,29 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}
}

/// Files on the remote storage are stored with paths, relative to the workdir.
/// That path includes both tenant and timeline ids, allowing it to have a unique remote storage path.
///
/// Errors if the path provided does not start from pageserver's workdir.
pub fn remote_path(
conf: &PageServerConf,
local_path: &Utf8Path,
generation: Generation,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;

let suffixed = format!("{0}{1}", stripped, generation.get_suffix());

RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
format!(
"to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
)
})
}
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -1971,7 +1848,6 @@ mod tests {
|
||||
context::RequestContext,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::layer::local_layer_path,
|
||||
Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
@@ -2154,20 +2030,11 @@ mod tests {
|
||||
]
|
||||
.into_iter()
|
||||
.map(|(name, contents): (LayerFileName, Vec<u8>)| {
|
||||
|
||||
let local_path = local_layer_path(
|
||||
harness.conf,
|
||||
&timeline.tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&name,
|
||||
&generation,
|
||||
);
|
||||
std::fs::write(&local_path, &contents).unwrap();
|
||||
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
|
||||
|
||||
Layer::for_resident(
|
||||
harness.conf,
|
||||
&timeline,
|
||||
local_path,
|
||||
name,
|
||||
LayerFileMetadata::new(contents.len() as u64, generation, shard),
|
||||
)
|
||||
@@ -2304,22 +2171,19 @@ mod tests {
|
||||
..
|
||||
} = TestSetup::new("metrics").await.unwrap();
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let local_path = local_layer_path(
|
||||
harness.conf,
|
||||
&timeline.tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer_file_name_1,
|
||||
&harness.generation,
|
||||
);
|
||||
let content_1 = dummy_contents("foo");
|
||||
std::fs::write(&local_path, &content_1).unwrap();
|
||||
std::fs::write(
|
||||
timeline_path.join(layer_file_name_1.file_name()),
|
||||
&content_1,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let layer_file_1 = Layer::for_resident(
|
||||
harness.conf,
|
||||
&timeline,
|
||||
local_path,
|
||||
layer_file_name_1.clone(),
|
||||
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
|
||||
);
|
||||
@@ -2388,7 +2252,12 @@ mod tests {
|
||||
|
||||
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
|
||||
// An empty IndexPart, just sufficient to ensure deserialization will succeed
|
||||
let example_index_part = IndexPart::example();
|
||||
let example_metadata = TimelineMetadata::example();
|
||||
let example_index_part = IndexPart::new(
|
||||
HashMap::new(),
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
);
|
||||
|
||||
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
@@ -56,13 +55,7 @@ pub async fn download_layer_file<'a>(
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
layer_file_name,
|
||||
&layer_metadata.generation,
|
||||
);
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_shard_id.tenant_id,
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::collections::HashMap;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::SerializeError;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -103,14 +104,15 @@ impl IndexPart {
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
fn new(
|
||||
layers_and_metadata: &HashMap<LayerFileName, LayerFileMetadata>,
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata: TimelineMetadata,
|
||||
) -> Self {
|
||||
// Transform LayerFileMetadata into IndexLayerMetadata
|
||||
let layer_metadata = layers_and_metadata
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
@@ -139,24 +141,20 @@ impl IndexPart {
|
||||
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
|
||||
serde_json::to_vec(self)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn example() -> Self {
|
||||
let example_metadata = TimelineMetadata::example();
|
||||
Self::new(
|
||||
&HashMap::new(),
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&UploadQueueInitialized> for IndexPart {
|
||||
fn from(uq: &UploadQueueInitialized) -> Self {
|
||||
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
|
||||
let metadata = uq.latest_metadata.clone();
|
||||
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
type Error = SerializeError;
|
||||
|
||||
Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
|
||||
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
|
||||
Ok(Self::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,8 +172,8 @@ pub struct IndexLayerMetadata {
|
||||
pub shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl From<&LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: &LayerFileMetadata) -> Self {
|
||||
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: LayerFileMetadata) -> Self {
|
||||
IndexLayerMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
|
||||
@@ -12,13 +12,18 @@ use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
|
||||
use super::Generation;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path,
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path,
|
||||
},
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
|
||||
use remote_storage::{GenericRemoteStorage, TimeTravelError};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::LayerFileMetadata;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
@@ -60,10 +65,11 @@ pub(crate) async fn upload_index_part<'a>(
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_path: &'a Utf8Path,
|
||||
remote_path: &'a RemotePath,
|
||||
metadata_size: u64,
|
||||
source_path: &'a Utf8Path,
|
||||
known_metadata: &'a LayerFileMetadata,
|
||||
generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
@@ -72,7 +78,8 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
pausable_failpoint!("before-upload-layer-pausable");
|
||||
|
||||
let source_file_res = fs::File::open(&local_path).await;
|
||||
let storage_path = remote_path(conf, source_path, generation)?;
|
||||
let source_file_res = fs::File::open(&source_path).await;
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
@@ -83,49 +90,34 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
// it has been written to disk yet.
|
||||
//
|
||||
// This is tested against `test_compaction_delete_before_upload`
|
||||
info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?,
|
||||
Err(e) => {
|
||||
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
|
||||
}
|
||||
};
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| format!("get the source file metadata for layer {local_path:?}"))?
|
||||
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
|
||||
.len();
|
||||
|
||||
let metadata_size = known_metadata.file_size();
|
||||
if metadata_size != fs_size {
|
||||
bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size)
|
||||
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
|
||||
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
|
||||
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
storage
|
||||
.upload(reader, fs_size, remote_path, None, cancel)
|
||||
.upload(reader, fs_size, &storage_path, None, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{local_path}'"))
|
||||
}
|
||||
|
||||
pub(super) async fn copy_timeline_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_path: &RemotePath,
|
||||
target_path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-copy-layer", |_| {
|
||||
bail!("failpoint before-copy-layer")
|
||||
});
|
||||
|
||||
pausable_failpoint!("before-copy-layer-pausable");
|
||||
|
||||
storage
|
||||
.copy_object(source_path, target_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("copy layer {source_path} to {target_path}"))
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))
|
||||
}
|
||||
|
||||
/// Uploads the given `initdb` data to the remote storage.
|
||||
|
||||
@@ -21,9 +21,8 @@ use self::{
|
||||
use super::{
|
||||
config::{SecondaryLocationConfig, TenantConfOpt},
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::{layer::local_layer_path, LayerFileName},
|
||||
storage_layer::LayerFileName,
|
||||
};
|
||||
|
||||
use pageserver_api::{
|
||||
@@ -183,7 +182,6 @@ impl SecondaryTenant {
|
||||
conf: &PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
name: LayerFileName,
|
||||
metadata: LayerFileMetadata,
|
||||
) {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -197,13 +195,9 @@ impl SecondaryTenant {
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&self.tenant_shard_id,
|
||||
&timeline_id,
|
||||
&name,
|
||||
&metadata.generation,
|
||||
);
|
||||
let path = conf
|
||||
.timeline_path(&self.tenant_shard_id, &timeline_id)
|
||||
.join(name.file_name());
|
||||
|
||||
let this = self.clone();
|
||||
|
||||
@@ -214,7 +208,7 @@ impl SecondaryTenant {
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
// Other local I/O errors are process-fatal: these should never happen.
|
||||
let deleted = std::fs::remove_file(local_path);
|
||||
let deleted = std::fs::remove_file(path);
|
||||
|
||||
let not_found = deleted
|
||||
.as_ref()
|
||||
|
||||
@@ -22,7 +22,7 @@ use crate::{
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
},
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::{layer::local_layer_path, LayerFileName},
|
||||
storage_layer::LayerFileName,
|
||||
tasks::{warn_when_period_overrun, BackgroundLoopKind},
|
||||
},
|
||||
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
|
||||
@@ -621,12 +621,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| (&l.name, l.metadata.generation))
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| (l.0, l.1.metadata.generation))
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let mut layer_count = layers_on_disk.len();
|
||||
@@ -637,24 +637,16 @@ impl<'a> TenantDownloader<'a> {
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
layer_count -= 1;
|
||||
layer_byte_count -= timeline_state
|
||||
.on_disk_layers
|
||||
.get(layer_file_name)
|
||||
.get(layer)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size();
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
self.secondary_state.get_tenant_shard_id(),
|
||||
timeline_id,
|
||||
layer_file_name,
|
||||
generation,
|
||||
);
|
||||
|
||||
delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path));
|
||||
delete_layers.push((*timeline_id, (*layer).clone()));
|
||||
}
|
||||
|
||||
progress.bytes_downloaded += layer_byte_count;
|
||||
@@ -669,7 +661,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
// Execute accumulated deletions
|
||||
for (timeline_id, layer_name, local_path) in delete_layers {
|
||||
for (timeline_id, layer_name) in delete_layers {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
let local_path = timeline_path.join(layer_name.to_string());
|
||||
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
@@ -758,6 +754,9 @@ impl<'a> TenantDownloader<'a> {
|
||||
) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(tenant_shard_id, &timeline.timeline_id);
|
||||
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
@@ -807,14 +806,10 @@ impl<'a> TenantDownloader<'a> {
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
let local_path = self
|
||||
.conf
|
||||
.timeline_path(tenant_shard_id, &timeline.timeline_id)
|
||||
.join(layer.name.file_name());
|
||||
match tokio::fs::metadata(&local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
@@ -908,13 +903,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
};
|
||||
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
let local_path = timeline_path.join(layer.name.to_string());
|
||||
|
||||
tracing::warn!(
|
||||
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
|
||||
|
||||
@@ -1139,15 +1139,15 @@ impl DeltaLayerInner {
|
||||
Ok(all_keys)
|
||||
}
|
||||
|
||||
/// Using the given writer, write out a version which has the earlier Lsns than `until`.
|
||||
///
|
||||
/// Return the amount of key value records pushed to the writer.
|
||||
/// Using the given writer, write out a truncated version, where LSNs higher than the
|
||||
/// truncate_at are missing.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn copy_prefix(
|
||||
&self,
|
||||
writer: &mut DeltaLayerWriter,
|
||||
until: Lsn,
|
||||
truncate_at: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
) -> anyhow::Result<()> {
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
|
||||
};
|
||||
@@ -1211,8 +1211,6 @@ impl DeltaLayerInner {
|
||||
// FIXME: buffering of DeltaLayerWriter
|
||||
let mut per_blob_copy = Vec::new();
|
||||
|
||||
let mut records = 0;
|
||||
|
||||
while let Some(item) = stream.try_next().await? {
|
||||
tracing::debug!(?item, "popped");
|
||||
let offset = item
|
||||
@@ -1231,7 +1229,7 @@ impl DeltaLayerInner {
|
||||
|
||||
prev = Option::from(item);
|
||||
|
||||
let actionable = actionable.filter(|x| x.0.lsn < until);
|
||||
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
|
||||
|
||||
let builder = if let Some((meta, offsets)) = actionable {
|
||||
// extend or create a new builder
|
||||
@@ -1299,7 +1297,7 @@ impl DeltaLayerInner {
|
||||
let will_init = crate::repository::ValueBytes::will_init(data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
|
||||
})
|
||||
.unwrap_or(false);
|
||||
|
||||
@@ -1316,10 +1314,7 @@ impl DeltaLayerInner {
|
||||
)
|
||||
.await;
|
||||
per_blob_copy = tmp;
|
||||
|
||||
res?;
|
||||
|
||||
records += 1;
|
||||
}
|
||||
|
||||
buffer = Some(res.buf);
|
||||
@@ -1331,7 +1326,7 @@ impl DeltaLayerInner {
|
||||
"with the sentinel above loop should had handled all"
|
||||
);
|
||||
|
||||
Ok(records)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
@@ -1404,6 +1399,7 @@ impl DeltaLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn stream_index_forwards<'a, R>(
|
||||
&'a self,
|
||||
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
|
||||
|
||||
@@ -2,13 +2,11 @@
|
||||
//! Helper functions for dealing with filenames of the image and delta layer files.
|
||||
//!
|
||||
use crate::repository::Key;
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::str::FromStr;
|
||||
|
||||
use regex::Regex;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::PersistentLayerDesc;
|
||||
@@ -76,19 +74,10 @@ impl DeltaFileName {
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_start_str = lsn_parts.next()?;
|
||||
let lsn_end_str = lsn_parts.next()?;
|
||||
|
||||
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if key_start_str.len() != 36
|
||||
|| key_end_str.len() != 36
|
||||
|| lsn_start_str.len() != 16
|
||||
|| lsn_end_str.len() != 16
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
|
||||
@@ -193,10 +182,6 @@ impl ImageFileName {
|
||||
return None;
|
||||
}
|
||||
|
||||
if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
|
||||
@@ -274,22 +259,9 @@ impl From<DeltaFileName> for LayerFileName {
|
||||
impl FromStr for LayerFileName {
|
||||
type Err = String;
|
||||
|
||||
/// Conversion from either a physical layer filename, or the string-ization of
|
||||
/// Self. When loading a physical layer filename, we drop any extra information
|
||||
/// not needed to build Self.
|
||||
fn from_str(value: &str) -> Result<Self, Self::Err> {
|
||||
let gen_suffix_regex = Regex::new("^(?<base>.+)-(?<gen>[0-9a-f]{8})$").unwrap();
|
||||
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
|
||||
Some(captures) => captures
|
||||
.name("base")
|
||||
.expect("Non-optional group")
|
||||
.as_str()
|
||||
.into(),
|
||||
None => value.into(),
|
||||
};
|
||||
|
||||
let delta = DeltaFileName::parse_str(&file_name);
|
||||
let image = ImageFileName::parse_str(&file_name);
|
||||
let delta = DeltaFileName::parse_str(value);
|
||||
let image = ImageFileName::parse_str(value);
|
||||
let ok = match (delta, image) {
|
||||
(None, None) => {
|
||||
return Err(format!(
|
||||
@@ -343,42 +315,3 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
|
||||
v.parse().map_err(|e| E::custom(e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
#[test]
|
||||
fn image_layer_parse() -> anyhow::Result<()> {
|
||||
let expected = LayerFileName::Image(ImageFileName {
|
||||
key_range: Key::from_i128(0)
|
||||
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
|
||||
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
|
||||
});
|
||||
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delta_layer_parse() -> anyhow::Result<()> {
|
||||
let expected = LayerFileName::Delta(DeltaFileName {
|
||||
key_range: Key::from_i128(0)
|
||||
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
|
||||
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
|
||||
..Lsn::from_hex("000000000154C481").unwrap(),
|
||||
});
|
||||
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,13 +4,12 @@ use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
@@ -124,25 +123,6 @@ impl PartialEq for Layer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn local_layer_path(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
layer_file_name: &LayerFileName,
|
||||
_generation: &Generation,
|
||||
) -> Utf8PathBuf {
|
||||
let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
|
||||
|
||||
timeline_path.join(layer_file_name.file_name())
|
||||
|
||||
// TODO: include generation in the name in now+1 releases.
|
||||
// timeline_path.join(format!(
|
||||
// "{}{}",
|
||||
// layer_file_name.file_name(),
|
||||
// generation.get_suffix()
|
||||
// ))
|
||||
}
|
||||
|
||||
impl Layer {
|
||||
/// Creates a layer value for a file we know to not be resident.
|
||||
pub(crate) fn for_evicted(
|
||||
@@ -151,14 +131,6 @@ impl Layer {
|
||||
file_name: LayerFileName,
|
||||
metadata: LayerFileMetadata,
|
||||
) -> Self {
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&timeline.tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&file_name,
|
||||
&metadata.generation,
|
||||
);
|
||||
|
||||
let desc = PersistentLayerDesc::from_filename(
|
||||
timeline.tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
@@ -171,7 +143,6 @@ impl Layer {
|
||||
let owner = Layer(Arc::new(LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
access_stats,
|
||||
desc,
|
||||
None,
|
||||
@@ -188,7 +159,6 @@ impl Layer {
|
||||
pub(crate) fn for_resident(
|
||||
conf: &'static PageServerConf,
|
||||
timeline: &Arc<Timeline>,
|
||||
local_path: Utf8PathBuf,
|
||||
file_name: LayerFileName,
|
||||
metadata: LayerFileMetadata,
|
||||
) -> ResidentLayer {
|
||||
@@ -214,7 +184,6 @@ impl Layer {
|
||||
LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
access_stats,
|
||||
desc,
|
||||
Some(inner),
|
||||
@@ -256,19 +225,9 @@ impl Layer {
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&timeline.tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&desc.filename(),
|
||||
&timeline.generation,
|
||||
);
|
||||
|
||||
LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
access_stats,
|
||||
desc,
|
||||
Some(inner),
|
||||
@@ -451,13 +410,6 @@ impl Layer {
|
||||
self.0.metadata()
|
||||
}
|
||||
|
||||
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
|
||||
self.0
|
||||
.timeline
|
||||
.upgrade()
|
||||
.map(|timeline| timeline.timeline_id)
|
||||
}
|
||||
|
||||
/// Traditional debug dumping facility
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
@@ -757,17 +709,19 @@ impl Drop for LayerInner {
|
||||
}
|
||||
|
||||
impl LayerInner {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline: &Arc<Timeline>,
|
||||
local_path: Utf8PathBuf,
|
||||
access_stats: LayerAccessStats,
|
||||
desc: PersistentLayerDesc,
|
||||
downloaded: Option<Arc<DownloadedLayer>>,
|
||||
generation: Generation,
|
||||
shard: ShardIndex,
|
||||
) -> Self {
|
||||
let path = conf
|
||||
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
|
||||
.join(desc.filename().to_string());
|
||||
|
||||
let (inner, version, init_status) = if let Some(inner) = downloaded {
|
||||
let version = inner.version;
|
||||
let resident = ResidentOrWantedEvicted::Resident(inner);
|
||||
@@ -783,7 +737,7 @@ impl LayerInner {
|
||||
LayerInner {
|
||||
conf,
|
||||
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
|
||||
path: local_path,
|
||||
path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
have_remote_client: timeline.remote_client.is_some(),
|
||||
@@ -1843,23 +1797,25 @@ impl ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the amount of keys and values written to the writer.
|
||||
pub(crate) async fn copy_delta_prefix(
|
||||
/// FIXME: truncate is a bad name because we are not truncating anything, but copying the
|
||||
/// filtered parts.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn copy_delta_prefix(
|
||||
&self,
|
||||
writer: &mut super::delta_layer::DeltaLayerWriter,
|
||||
until: Lsn,
|
||||
truncate_at: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
) -> anyhow::Result<()> {
|
||||
use LayerKind::*;
|
||||
|
||||
let owner = &self.owner.0;
|
||||
|
||||
match self.downloaded.get(owner, ctx).await? {
|
||||
Delta(ref d) => d
|
||||
.copy_prefix(writer, until, ctx)
|
||||
.copy_prefix(writer, truncate_at, ctx)
|
||||
.await
|
||||
.with_context(|| format!("copy_delta_prefix until {until} of {self}")),
|
||||
Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
|
||||
.with_context(|| format!("truncate {self}")),
|
||||
Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
mod compaction;
|
||||
pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
mod eviction_task;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
@@ -23,9 +22,8 @@ use pageserver_api::{
|
||||
},
|
||||
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
|
||||
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
|
||||
TimelineState,
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, ShardNumber, TenantShardId},
|
||||
@@ -60,7 +58,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
@@ -865,13 +862,9 @@ impl Timeline {
|
||||
// Initialise the reconstruct state for the key with the cache
|
||||
// entry returned above.
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
|
||||
// Only add the cached image to the reconstruct state when it exists.
|
||||
if cached_page_img.is_some() {
|
||||
let mut key_state = VectoredValueReconstructState::default();
|
||||
key_state.img = cached_page_img;
|
||||
reconstruct_state.keys.insert(key, Ok(key_state));
|
||||
}
|
||||
let mut key_state = VectoredValueReconstructState::default();
|
||||
key_state.img = cached_page_img;
|
||||
reconstruct_state.keys.insert(key, Ok(key_state));
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
@@ -1083,7 +1076,7 @@ impl Timeline {
|
||||
// We should generalize this into Keyspace::contains in the future.
|
||||
for range in &keyspace.ranges {
|
||||
if range.start.field1 < METADATA_KEY_BEGIN_PREFIX
|
||||
|| range.end.field1 > METADATA_KEY_END_PREFIX
|
||||
|| range.end.field1 >= METADATA_KEY_END_PREFIX
|
||||
{
|
||||
return Err(GetVectoredError::Other(anyhow::anyhow!(
|
||||
"only metadata keyspace can be scanned"
|
||||
@@ -1501,12 +1494,6 @@ impl Timeline {
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||
self.freeze_and_flush0().await
|
||||
}
|
||||
|
||||
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
|
||||
// polluting the span hierarchy.
|
||||
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
|
||||
let to_lsn = self.freeze_inmem_layer(false).await;
|
||||
self.flush_frozen_layers_and_wait(to_lsn).await
|
||||
}
|
||||
@@ -1905,7 +1892,7 @@ impl Timeline {
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
pub(crate) async fn download_layer(
|
||||
&self,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_file_name: &str,
|
||||
) -> anyhow::Result<Option<bool>> {
|
||||
let Some(layer) = self.find_layer(layer_file_name).await else {
|
||||
return Ok(None);
|
||||
@@ -1923,10 +1910,7 @@ impl Timeline {
|
||||
/// Evict just one layer.
|
||||
///
|
||||
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
|
||||
pub(crate) async fn evict_layer(
|
||||
&self,
|
||||
layer_file_name: &LayerFileName,
|
||||
) -> anyhow::Result<Option<bool>> {
|
||||
pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||
let _gate = self
|
||||
.gate
|
||||
.enter()
|
||||
@@ -2000,12 +1984,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
|
||||
|
||||
// Private functions
|
||||
impl Timeline {
|
||||
pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.switch_aux_file_policy
|
||||
.unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy)
|
||||
.switch_to_aux_file_v2
|
||||
.unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2)
|
||||
}
|
||||
|
||||
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
||||
@@ -2417,8 +2402,8 @@ impl Timeline {
|
||||
|
||||
for discovered in discovered {
|
||||
let (name, kind) = match discovered {
|
||||
Discovered::Layer(layer_file_name, local_path, file_size) => {
|
||||
discovered_layers.push((layer_file_name, local_path, file_size));
|
||||
Discovered::Layer(file_name, file_size) => {
|
||||
discovered_layers.push((file_name, file_size));
|
||||
continue;
|
||||
}
|
||||
Discovered::Metadata => {
|
||||
@@ -2463,7 +2448,7 @@ impl Timeline {
|
||||
let mut needs_cleanup = Vec::new();
|
||||
let mut total_physical_size = 0;
|
||||
|
||||
for (name, local_path, decision) in decided {
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
// Remote is authoritative, but we may still choose to retain
|
||||
@@ -2473,23 +2458,26 @@ impl Timeline {
|
||||
// the correct generation.
|
||||
UseLocal(remote)
|
||||
} else {
|
||||
let local_path = local_path.as_ref().expect("Locally found layer must have path");
|
||||
init::cleanup_local_file_for_remote(local_path, &local, &remote)?;
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(DismissedLayer::Future { local }) => {
|
||||
if local.is_some() {
|
||||
let local_path = local_path.expect("Locally found layer must have path");
|
||||
init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?;
|
||||
path.push(name.file_name());
|
||||
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
|
||||
path.pop();
|
||||
}
|
||||
needs_cleanup.push(name);
|
||||
continue;
|
||||
}
|
||||
Err(DismissedLayer::LocalOnly(local)) => {
|
||||
let local_path = local_path.expect("Locally found layer must have path");
|
||||
init::cleanup_local_only_file(&local_path, &name, &local)?;
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_only_file(&path, &name, &local)?;
|
||||
path.pop();
|
||||
// this file never existed remotely, we will have to do rework
|
||||
continue;
|
||||
}
|
||||
@@ -2505,18 +2493,7 @@ impl Timeline {
|
||||
let layer = match decision {
|
||||
UseLocal(m) => {
|
||||
total_physical_size += m.file_size();
|
||||
|
||||
let local_path = local_path.unwrap_or_else(|| {
|
||||
local_layer_path(
|
||||
conf,
|
||||
&this.tenant_shard_id,
|
||||
&this.timeline_id,
|
||||
&name,
|
||||
&m.generation,
|
||||
)
|
||||
});
|
||||
|
||||
Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard()
|
||||
Layer::for_resident(conf, &this, name, m).drop_eviction_guard()
|
||||
}
|
||||
Evicted(remote) | UseRemote { remote, .. } => {
|
||||
Layer::for_evicted(conf, &this, name, remote)
|
||||
@@ -2997,11 +2974,11 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
async fn find_layer(&self, layer_name: &LayerFileName) -> Option<Layer> {
|
||||
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
|
||||
let guard = self.layers.read().await;
|
||||
for historic_layer in guard.layer_map().iter_historic_layers() {
|
||||
let historic_layer_name = historic_layer.filename();
|
||||
if layer_name == &historic_layer_name {
|
||||
let historic_layer_name = historic_layer.filename().file_name();
|
||||
if layer_file_name == historic_layer_name {
|
||||
return Some(guard.get_from_desc(&historic_layer));
|
||||
}
|
||||
}
|
||||
@@ -3031,7 +3008,7 @@ impl Timeline {
|
||||
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().filename(),
|
||||
(&layer.metadata()).into(),
|
||||
layer.metadata().into(),
|
||||
last_activity_ts,
|
||||
)
|
||||
});
|
||||
@@ -3533,7 +3510,7 @@ impl Timeline {
|
||||
Ok(ancestor)
|
||||
}
|
||||
|
||||
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
|
||||
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
|
||||
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
|
||||
format!(
|
||||
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
|
||||
@@ -4349,49 +4326,6 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => {}
|
||||
)
|
||||
}
|
||||
|
||||
/// Detach this timeline from its ancestor by copying all of the ancestor's layers as this
|
||||
/// Timeline's layers up to the ancestor_lsn.
|
||||
///
|
||||
/// Requires a timeline that:
|
||||
/// - has an ancestor to detach from
|
||||
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
|
||||
/// a technical requirement
|
||||
/// - has prev_lsn in remote storage (temporary restriction)
|
||||
///
|
||||
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
|
||||
/// polled again until completion.
|
||||
///
|
||||
/// During the operation all timelines sharing the data with this timeline will be reparented
|
||||
/// from our ancestor to be branches of this timeline.
|
||||
pub(crate) async fn prepare_to_detach_from_ancestor(
|
||||
self: &Arc<Timeline>,
|
||||
tenant: &crate::tenant::Tenant,
|
||||
options: detach_ancestor::Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<
|
||||
(
|
||||
completion::Completion,
|
||||
detach_ancestor::PreparedTimelineDetach,
|
||||
),
|
||||
detach_ancestor::Error,
|
||||
> {
|
||||
detach_ancestor::prepare(self, tenant, options, ctx).await
|
||||
}
|
||||
|
||||
/// Completes the ancestor detach. This method is to be called while holding the
|
||||
/// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
|
||||
/// timeline be deleted. After this method returns successfully, tenant must be reloaded.
|
||||
///
|
||||
/// Pageserver receiving a SIGKILL during this operation is not supported (yet).
|
||||
pub(crate) async fn complete_detaching_timeline_ancestor(
|
||||
self: &Arc<Timeline>,
|
||||
tenant: &crate::tenant::Tenant,
|
||||
prepared: detach_ancestor::PreparedTimelineDetach,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<TimelineId>, anyhow::Error> {
|
||||
detach_ancestor::complete(self, tenant, prepared, ctx).await
|
||||
}
|
||||
}
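The two methods above form a two-phase API: `prepare_to_detach_from_ancestor` copies or rewrites the ancestor's layers and returns a completion guard plus the prepared layer set, and `complete_detaching_timeline_ancestor` publishes them and reparents sibling timelines. The following is an illustrative sketch only, not part of this change, of how a caller inside the pageserver crate might chain them; the tenant-manager plumbing and context construction are assumed to exist around it.

// Illustrative sketch, assuming crate-internal context (Timeline, Tenant and
// RequestContext already resolved by the caller); not part of this diff.
async fn detach_timeline_from_ancestor(
    timeline: &std::sync::Arc<Timeline>,
    tenant: &crate::tenant::Tenant,
    ctx: &RequestContext,
) -> anyhow::Result<Vec<utils::id::TimelineId>> {
    let options = detach_ancestor::Options::default();

    // Phase 1: prepare. The returned completion guard blocks other concurrent
    // detach operations for as long as it is held.
    let (_guard, prepared) = timeline
        .prepare_to_detach_from_ancestor(tenant, options, ctx)
        .await?;

    // Phase 2: publish the prepared layers and reparent sibling timelines.
    // The caller is expected to reload the tenant afterwards.
    timeline
        .complete_detaching_timeline_ancestor(tenant, prepared, ctx)
        .await
}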
|
||||
|
||||
/// Top-level failure to compact.
|
||||
@@ -4500,24 +4434,6 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn rewrite_layers(
|
||||
self: &Arc<Self>,
|
||||
replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
drop_layers: Vec<Layer>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
|
||||
|
||||
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
|
||||
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Schedules the uploads of the given image layers
|
||||
fn upload_new_image_layers(
|
||||
self: &Arc<Self>,
|
||||
@@ -4676,8 +4592,6 @@ impl Timeline {
|
||||
retain_lsns: Vec<Lsn>,
|
||||
new_gc_cutoff: Lsn,
|
||||
) -> anyhow::Result<GcResult> {
|
||||
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
|
||||
|
||||
let now = SystemTime::now();
|
||||
let mut result: GcResult = GcResult::default();
|
||||
|
||||
|
||||
@@ -15,8 +15,7 @@ use anyhow::{anyhow, Context};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
@@ -94,7 +93,7 @@ impl Timeline {
|
||||
// Define partitioning schema if needed
|
||||
|
||||
// FIXME: the match should only cover repartitioning, not the next steps
|
||||
let partition_count = match self
|
||||
match self
|
||||
.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
self.get_compaction_target_size(),
|
||||
@@ -147,7 +146,6 @@ impl Timeline {
|
||||
assert!(sparse_layers.is_empty());
|
||||
|
||||
self.upload_new_image_layers(dense_layers)?;
|
||||
dense_partitioning.parts.len()
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -159,150 +157,9 @@ impl Timeline {
|
||||
if !self.cancel.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
1
|
||||
}
|
||||
};
|
||||
|
||||
if self.shard_identity.count >= ShardCount::new(2) {
|
||||
// Limit the number of layer rewrites to the number of partitions: this means its
|
||||
// runtime should be comparable to a full round of image layer creations, rather than
|
||||
// being potentially much longer.
|
||||
let rewrite_max = partition_count;
|
||||
|
||||
self.compact_shard_ancestors(rewrite_max, ctx).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check for layers that are eligible to be rewritten:
|
||||
/// - Shard splitting: after a shard split, rewrite ancestor layers beyond pitr_interval so that
|
||||
/// we don't indefinitely retain keys in this shard that aren't needed.
|
||||
/// - For future use: layers beyond pitr_interval that are in formats we would
|
||||
/// rather not maintain compatibility with indefinitely.
|
||||
///
|
||||
/// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
|
||||
/// how much work it will try to do in each compaction pass.
|
||||
async fn compact_shard_ancestors(
|
||||
self: &Arc<Self>,
|
||||
rewrite_max: usize,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut drop_layers = Vec::new();
|
||||
let layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
|
||||
// We will use the PITR cutoff as a condition for rewriting layers.
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map().iter_historic_layers() {
|
||||
let layer = layers.get_from_desc(&layer_desc);
|
||||
if layer.metadata().shard.shard_count == self.shard_identity.count {
|
||||
// This layer does not belong to a historic ancestor, no need to re-image it.
|
||||
continue;
|
||||
}
|
||||
|
||||
// This layer was created on an ancestor shard: check if it contains any data for this shard.
|
||||
let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
|
||||
let layer_local_page_count = sharded_range.page_count();
|
||||
let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
|
||||
if layer_local_page_count == 0 {
|
||||
// This ancestral layer only covers keys that belong to other shards.
|
||||
// We include the full metadata in the log: if we had some critical bug that caused
|
||||
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard.",
|
||||
);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
|
||||
// wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
|
||||
// should be !is_key_disposable()
|
||||
let range = layer_desc.get_key_range();
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
debug_assert!(self.shard_identity.is_key_disposable(&key));
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
|
||||
drop_layers.push(layer);
|
||||
continue;
|
||||
} else if layer_local_page_count != u32::MAX
|
||||
&& layer_local_page_count == layer_raw_page_count
|
||||
{
|
||||
debug!(%layer,
|
||||
"layer is entirely shard local ({} keys), no need to filter it",
|
||||
layer_local_page_count
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Don't bother re-writing a layer unless it will at least halve its size
|
||||
if layer_local_page_count != u32::MAX
|
||||
&& layer_local_page_count > layer_raw_page_count / 2
|
||||
{
|
||||
debug!(%layer,
|
||||
"layer is already mostly local ({}/{}), not rewriting",
|
||||
layer_local_page_count,
|
||||
layer_raw_page_count
|
||||
);
|
||||
}
|
||||
|
||||
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
|
||||
// without incurring the I/O cost of a rewrite.
|
||||
if layer_desc.get_lsn_range().end >= pitr_cutoff {
|
||||
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
|
||||
layer_desc.get_lsn_range().end, pitr_cutoff);
|
||||
continue;
|
||||
}
|
||||
|
||||
if layer_desc.is_delta() {
|
||||
// We do not yet implement rewrite of delta layers
|
||||
debug!(%layer, "Skipping rewrite of delta layer");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Only rewrite layers if they would have different remote paths: either they belong to this
|
||||
// shard but an old generation, or they belonged to another shard. This also implicitly
|
||||
// guarantees that the layer is persistent in remote storage (as only remote persistent
|
||||
// layers are carried across shard splits, any local-only layer would be in the current generation)
|
||||
if layer.metadata().generation == self.generation
|
||||
&& layer.metadata().shard.shard_count == self.shard_identity.count
|
||||
{
|
||||
debug!(%layer, "Skipping rewrite, is not from old generation");
|
||||
continue;
|
||||
}
|
||||
|
||||
if layers_to_rewrite.len() >= rewrite_max {
|
||||
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
layers_to_rewrite.len()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Fall through: all our conditions for doing a rewrite passed.
|
||||
// TODO: implement rewriting
|
||||
tracing::debug!(%layer, "Would rewrite layer");
|
||||
}
|
||||
|
||||
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
|
||||
drop(layers);
|
||||
|
||||
// TODO: collect layers to rewrite
|
||||
let replace_layers = Vec::new();
|
||||
|
||||
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
|
||||
self.rewrite_layers(replace_layers, drop_layers).await?;
|
||||
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
// We wait for all uploads to complete before finishing this compaction stage. This is not
|
||||
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
|
||||
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
|
||||
// load.
|
||||
remote_client.wait_completion().await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
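The page-count checks above encode a three-way decision per ancestral layer: drop it when it holds no keys for this shard, keep it when it is entirely or mostly shard-local, and only otherwise consider a rewrite. The snippet below is an illustrative, self-contained restatement of that decision over bare counts; it is not part of this change, and the real code additionally treats u32::MAX as "unknown page count" and checks the PITR cutoff, delta-ness and generation before rewriting.

// Illustrative only (not part of this diff): mirrors the per-layer decision
// described above using bare page counts instead of ShardedRange and layer
// descriptors.
#[derive(Debug, PartialEq)]
enum AncestorLayerAction {
    /// No keys for this shard: safe to drop after a shard split.
    Drop,
    /// Fully shard-local, or a rewrite would not even halve it: keep as-is.
    Keep,
    /// Worth rewriting into a shard-local layer (rewriting itself is still a TODO above).
    Rewrite,
}

fn classify_ancestor_layer(local_pages: u32, raw_pages: u32) -> AncestorLayerAction {
    if local_pages == 0 {
        AncestorLayerAction::Drop
    } else if local_pages == raw_pages || local_pages > raw_pages / 2 {
        AncestorLayerAction::Keep
    } else {
        AncestorLayerAction::Rewrite
    }
}

fn main() {
    assert_eq!(classify_ancestor_layer(0, 1000), AncestorLayerAction::Drop);
    assert_eq!(classify_ancestor_layer(900, 1000), AncestorLayerAction::Keep);
    assert_eq!(classify_ancestor_layer(100, 1000), AncestorLayerAction::Rewrite);
}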
|
||||
|
||||
|
||||
@@ -422,10 +422,6 @@ impl DeleteTimelineFlow {
|
||||
pub(crate) fn is_finished(&self) -> bool {
|
||||
matches!(self, Self::Finished)
|
||||
}
|
||||
|
||||
pub(crate) fn is_not_started(&self) -> bool {
|
||||
matches!(self, Self::NotStarted)
|
||||
}
|
||||
}
|
||||
|
||||
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
|
||||
|
||||
@@ -1,550 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{layer_manager::LayerManager, Timeline};
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
tenant::{
|
||||
storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
|
||||
Tenant,
|
||||
},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum Error {
|
||||
#[error("no ancestors")]
|
||||
NoAncestor,
|
||||
#[error("too many ancestors")]
|
||||
TooManyAncestors,
|
||||
#[error("shutting down, please retry later")]
|
||||
ShuttingDown,
|
||||
#[error("detached timeline must receive writes before the operation")]
|
||||
DetachedTimelineNeedsWrites,
|
||||
#[error("flushing failed")]
|
||||
FlushAncestor(#[source] anyhow::Error),
|
||||
#[error("layer download failed")]
|
||||
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
|
||||
#[error("copying LSN prefix locally failed")]
|
||||
CopyDeltaPrefix(#[source] anyhow::Error),
|
||||
#[error("upload rewritten layer")]
|
||||
UploadRewritten(#[source] anyhow::Error),
|
||||
|
||||
#[error("ancestor is already being detached by: {}", .0)]
|
||||
OtherTimelineDetachOngoing(TimelineId),
|
||||
|
||||
#[error("remote copying layer failed")]
|
||||
CopyFailed(#[source] anyhow::Error),
|
||||
|
||||
#[error("unexpected error")]
|
||||
Unexpected(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
pub(crate) struct PreparedTimelineDetach {
|
||||
layers: Vec<Layer>,
|
||||
}
|
||||
|
||||
/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Options {
|
||||
pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
|
||||
pub(crate) copy_concurrency: std::num::NonZeroUsize,
|
||||
}
|
||||
|
||||
impl Default for Options {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
|
||||
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`Timeline::prepare_to_detach_from_ancestor`]
|
||||
pub(super) async fn prepare(
|
||||
detached: &Arc<Timeline>,
|
||||
tenant: &Tenant,
|
||||
options: Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
|
||||
use Error::*;
|
||||
|
||||
if detached.remote_client.as_ref().is_none() {
|
||||
unimplemented!("no new code for running without remote storage");
|
||||
}
|
||||
|
||||
let Some((ancestor, ancestor_lsn)) = detached
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|tl| (tl.clone(), detached.ancestor_lsn))
|
||||
else {
|
||||
return Err(NoAncestor);
|
||||
};
|
||||
|
||||
if !ancestor_lsn.is_valid() {
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
|
||||
if ancestor.ancestor_timeline.is_some() {
|
||||
// non-technical requirement; we could flatten N ancestors just as easily but we chose
|
||||
// not to
|
||||
return Err(TooManyAncestors);
|
||||
}
|
||||
|
||||
if detached.get_prev_record_lsn() == Lsn::INVALID
|
||||
|| detached.disk_consistent_lsn.load() == ancestor_lsn
|
||||
{
|
||||
// this is to avoid a problem that after detaching we would be unable to start up the
|
||||
// compute because of "PREV_LSN: invalid".
|
||||
return Err(DetachedTimelineNeedsWrites);
|
||||
}
|
||||
|
||||
// before we acquire the gate, we must mark the ancestor as having a detach operation
|
||||
// ongoing, which will block other concurrent detach operations so we don't get into awkward
|
||||
// situations where there would be two branches trying to reparent earlier branches.
|
||||
let (guard, barrier) = completion::channel();
|
||||
|
||||
{
|
||||
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
|
||||
if let Some((tl, other)) = guard.as_ref() {
|
||||
if !other.is_ready() {
|
||||
return Err(OtherTimelineDetachOngoing(*tl));
|
||||
}
|
||||
}
|
||||
*guard = Some((detached.timeline_id, barrier));
|
||||
}
|
||||
|
||||
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
|
||||
|
||||
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
|
||||
let span =
|
||||
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
|
||||
async {
|
||||
let started_at = std::time::Instant::now();
|
||||
let freeze_and_flush = ancestor.freeze_and_flush0();
|
||||
let mut freeze_and_flush = std::pin::pin!(freeze_and_flush);
|
||||
|
||||
let res =
|
||||
tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush)
|
||||
.await;
|
||||
|
||||
let res = match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => {
|
||||
tracing::info!("freezing and flushing ancestor is still ongoing");
|
||||
freeze_and_flush.await
|
||||
}
|
||||
};
|
||||
|
||||
res.map_err(FlushAncestor)?;
|
||||
|
||||
// we do not need to wait for uploads to complete but we do need `struct Layer`,
|
||||
// copying delta prefix is unsupported currently for `InMemoryLayer`.
|
||||
tracing::info!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"froze and flushed the ancestor"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
.instrument(span)
|
||||
.await?;
|
||||
}
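The freeze-and-flush above uses a small, reusable pattern: await the operation with a short timeout so the fast path stays quiet, and if the timeout elapses, log once and resume the same pinned future instead of dropping and restarting it. A self-contained sketch of that pattern follows; it is not part of this change, and the one-second duration and log message are placeholders.

use std::time::Duration;

/// Awaits `fut`, logging a progress message if it has not finished within one
/// second. The future is pinned so the timed-out attempt can be resumed rather
/// than dropped and restarted.
async fn await_with_progress_log<F, T>(fut: F) -> T
where
    F: std::future::Future<Output = T>,
{
    let mut fut = std::pin::pin!(fut);
    match tokio::time::timeout(Duration::from_secs(1), &mut fut).await {
        Ok(output) => output,
        Err(_elapsed) => {
            tracing::info!("operation is still ongoing");
            fut.await
        }
    }
}

#[tokio::main]
async fn main() {
    let value = await_with_progress_log(async {
        tokio::time::sleep(Duration::from_millis(1500)).await;
        42
    })
    .await;
    assert_eq!(value, 42);
}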
|
||||
|
||||
let end_lsn = ancestor_lsn + 1;
|
||||
|
||||
let (filtered_layers, straddling_branchpoint, rest_of_historic) = {
|
||||
// we do not need to start from our layers, because they can only be layers that come
|
||||
// *after* ancestor_lsn
|
||||
let layers = tokio::select! {
|
||||
guard = ancestor.layers.read() => guard,
|
||||
_ = detached.cancel.cancelled() => {
|
||||
return Err(ShuttingDown);
|
||||
}
|
||||
_ = ancestor.cancel.cancelled() => {
|
||||
return Err(ShuttingDown);
|
||||
}
|
||||
};
|
||||
|
||||
// between retries, these can change if compaction or gc ran in between. this will mean
|
||||
// we have to redo work.
|
||||
partition_work(ancestor_lsn, &layers)
|
||||
};
|
||||
|
||||
// TODO: layers are already sorted by something: use that to determine how much of remote
|
||||
// copies are already done.
|
||||
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");
|
||||
|
||||
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
|
||||
let mut new_layers: Vec<Layer> =
|
||||
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
|
||||
|
||||
{
|
||||
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
|
||||
let mut wrote_any = false;
|
||||
|
||||
let limiter = Arc::new(tokio::sync::Semaphore::new(
|
||||
options.rewrite_concurrency.get(),
|
||||
));
|
||||
|
||||
for layer in straddling_branchpoint {
|
||||
let limiter = limiter.clone();
|
||||
let timeline = detached.clone();
|
||||
let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
|
||||
tasks.spawn(async move {
|
||||
let _permit = limiter.acquire().await;
|
||||
let copied =
|
||||
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
|
||||
.await?;
|
||||
Ok(copied)
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
match res {
|
||||
Ok(Ok(Some(copied))) => {
|
||||
wrote_any = true;
|
||||
tracing::info!(layer=%copied, "rewrote and uploaded");
|
||||
new_layers.push(copied);
|
||||
}
|
||||
Ok(Ok(None)) => {}
|
||||
Ok(Err(e)) => return Err(e),
|
||||
Err(je) => return Err(Unexpected(je.into())),
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: the fsync should be mandatory, after both rewrites and copies
|
||||
if wrote_any {
|
||||
let timeline_dir = VirtualFile::open(
|
||||
&detached
|
||||
.conf
|
||||
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
|
||||
)
|
||||
.await
|
||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
||||
timeline_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
||||
}
|
||||
}
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));
|
||||
|
||||
for adopted in rest_of_historic {
|
||||
let limiter = limiter.clone();
|
||||
let timeline = detached.clone();
|
||||
|
||||
tasks.spawn(
|
||||
async move {
|
||||
let _permit = limiter.acquire().await;
|
||||
let owned =
|
||||
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
|
||||
tracing::info!(layer=%owned, "remote copied");
|
||||
Ok(owned)
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
}
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
match res {
|
||||
Ok(Ok(owned)) => {
|
||||
new_layers.push(owned);
|
||||
}
|
||||
Ok(Err(failed)) => {
|
||||
return Err(failed);
|
||||
}
|
||||
Err(je) => return Err(Unexpected(je.into())),
|
||||
}
|
||||
}
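Both the prefix-rewrite phase and the remote-copy phase above share one concurrency shape: spawn one task per layer into a `JoinSet`, gate the actual work behind a shared `Semaphore` sized from `Options`, and drain results with `join_next`. A stripped-down, self-contained sketch of that shape follows; it is not part of this change, and the per-item work is a placeholder.

use std::sync::Arc;
use tokio::{sync::Semaphore, task::JoinSet};

#[tokio::main]
async fn main() {
    // Placeholder inputs; in the code above these are layers to rewrite or copy.
    let items: Vec<u64> = (0..20).collect();
    let concurrency = 4;

    let limiter = Arc::new(Semaphore::new(concurrency));
    let mut tasks = JoinSet::new();

    for item in items {
        let limiter = limiter.clone();
        tasks.spawn(async move {
            // Holding the permit for the duration of the work bounds how many
            // items are processed at once.
            let _permit = limiter.acquire().await.expect("semaphore not closed");
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
            item * 2
        });
    }

    let mut results = Vec::new();
    while let Some(res) = tasks.join_next().await {
        // A JoinError here corresponds to the `Unexpected` error in the code above.
        results.push(res.expect("task panicked or was cancelled"));
    }
    assert_eq!(results.len(), 20);
}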
|
||||
|
||||
// TODO: fsync directory again if we hardlinked something
|
||||
|
||||
let prepared = PreparedTimelineDetach { layers: new_layers };
|
||||
|
||||
Ok((guard, prepared))
|
||||
}
|
||||
|
||||
fn partition_work(
|
||||
ancestor_lsn: Lsn,
|
||||
source_layermap: &LayerManager,
|
||||
) -> (usize, Vec<Layer>, Vec<Layer>) {
|
||||
let mut straddling_branchpoint = vec![];
|
||||
let mut rest_of_historic = vec![];
|
||||
|
||||
let mut later_by_lsn = 0;
|
||||
|
||||
for desc in source_layermap.layer_map().iter_historic_layers() {
|
||||
// off-by-one chances here:
|
||||
// - start is inclusive
|
||||
// - end is exclusive
|
||||
if desc.lsn_range.start > ancestor_lsn {
|
||||
later_by_lsn += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let target = if desc.lsn_range.start <= ancestor_lsn
|
||||
&& desc.lsn_range.end > ancestor_lsn
|
||||
&& desc.is_delta
|
||||
{
|
||||
// TODO: image layer at Lsn optimization
|
||||
&mut straddling_branchpoint
|
||||
} else {
|
||||
&mut rest_of_historic
|
||||
};
|
||||
|
||||
target.push(source_layermap.get_from_desc(&desc));
|
||||
}
|
||||
|
||||
(later_by_lsn, straddling_branchpoint, rest_of_historic)
|
||||
}
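`partition_work` classifies layers purely by their LSN range relative to `ancestor_lsn`: a start strictly above the branch point means the layer is counted and skipped, a delta layer whose half-open range straddles the branch point goes to the rewrite bucket, and everything else is adopted as-is. An illustrative, self-contained restatement of that rule over plain `u64` LSNs follows; it is not part of this change.

use std::ops::Range;

#[derive(Debug, PartialEq)]
enum Bucket {
    /// lsn_range.start > ancestor_lsn: not needed by the detached timeline.
    LaterThanBranchPoint,
    /// Delta layer whose [start, end) range straddles the branch point:
    /// only its prefix up to ancestor_lsn + 1 needs to be rewritten.
    StraddlesBranchPoint,
    /// Fully at or below the branch point (or an image layer): adopted verbatim.
    AdoptedAsIs,
}

fn classify(lsn_range: &Range<u64>, is_delta: bool, ancestor_lsn: u64) -> Bucket {
    if lsn_range.start > ancestor_lsn {
        Bucket::LaterThanBranchPoint
    } else if lsn_range.end > ancestor_lsn && is_delta {
        // start is inclusive and <= ancestor_lsn, end is exclusive and beyond it.
        Bucket::StraddlesBranchPoint
    } else {
        Bucket::AdoptedAsIs
    }
}

fn main() {
    assert_eq!(classify(&(60..80), true, 50), Bucket::LaterThanBranchPoint);
    assert_eq!(classify(&(40..60), true, 50), Bucket::StraddlesBranchPoint);
    assert_eq!(classify(&(40..60), false, 50), Bucket::AdoptedAsIs);
    assert_eq!(classify(&(10..30), true, 50), Bucket::AdoptedAsIs);
}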
|
||||
|
||||
async fn upload_rewritten_layer(
|
||||
end_lsn: Lsn,
|
||||
layer: &Layer,
|
||||
target: &Arc<Timeline>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Layer>, Error> {
|
||||
use Error::UploadRewritten;
|
||||
let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?;
|
||||
|
||||
let Some(copied) = copied else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
// FIXME: better shutting-down error
|
||||
target
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.upload_layer_file(&copied, cancel)
|
||||
.await
|
||||
.map_err(UploadRewritten)?;
|
||||
|
||||
Ok(Some(copied.into()))
|
||||
}
|
||||
|
||||
async fn copy_lsn_prefix(
|
||||
end_lsn: Lsn,
|
||||
layer: &Layer,
|
||||
target_timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<ResidentLayer>, Error> {
|
||||
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
|
||||
|
||||
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
|
||||
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
target_timeline.conf,
|
||||
target_timeline.timeline_id,
|
||||
target_timeline.tenant_shard_id,
|
||||
layer.layer_desc().key_range.start,
|
||||
layer.layer_desc().lsn_range.start..end_lsn,
|
||||
)
|
||||
.await
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
|
||||
let resident = layer
|
||||
.download_and_keep_resident()
|
||||
.await
|
||||
// likely shutdown
|
||||
.map_err(RewrittenDeltaDownloadFailed)?;
|
||||
|
||||
let records = resident
|
||||
.copy_delta_prefix(&mut writer, end_lsn, ctx)
|
||||
.await
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
|
||||
drop(resident);
|
||||
|
||||
tracing::debug!(%layer, records, "copied records");
|
||||
|
||||
if records == 0 {
|
||||
drop(writer);
|
||||
// TODO: we might want to store an empty marker in remote storage for this
|
||||
// layer so that we will not needlessly walk `layer` on repeated attempts.
|
||||
Ok(None)
|
||||
} else {
|
||||
// reuse the key instead of adding more holes between layers by using the real
|
||||
// highest key in the layer.
|
||||
let reused_highest_key = layer.layer_desc().key_range.end;
|
||||
let copied = writer
|
||||
.finish(reused_highest_key, target_timeline, ctx)
|
||||
.await
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
|
||||
tracing::debug!(%layer, %copied, "new layer produced");
|
||||
|
||||
Ok(Some(copied))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote
|
||||
/// storage on successful return without the adopted layer being added to `index_part.json`.
|
||||
async fn remote_copy(
|
||||
adopted: &Layer,
|
||||
adoptee: &Arc<Timeline>,
|
||||
generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Layer, Error> {
|
||||
use Error::CopyFailed;
|
||||
|
||||
// depending on Layer::keep_resident, we could hardlink
|
||||
|
||||
let mut metadata = adopted.metadata();
|
||||
debug_assert!(metadata.generation <= generation);
|
||||
metadata.generation = generation;
|
||||
|
||||
let owned = crate::tenant::storage_layer::Layer::for_evicted(
|
||||
adoptee.conf,
|
||||
adoptee,
|
||||
adopted.layer_desc().filename(),
|
||||
metadata,
|
||||
);
|
||||
|
||||
// FIXME: better shutting-down error
|
||||
adoptee
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.copy_timeline_layer(adopted, &owned, cancel)
|
||||
.await
|
||||
.map(move |()| owned)
|
||||
.map_err(CopyFailed)
|
||||
}
|
||||
|
||||
/// See [`Timeline::complete_detaching_timeline_ancestor`].
|
||||
pub(super) async fn complete(
|
||||
detached: &Arc<Timeline>,
|
||||
tenant: &Tenant,
|
||||
prepared: PreparedTimelineDetach,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<Vec<TimelineId>, anyhow::Error> {
|
||||
let rtc = detached
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.expect("has to have a remote timeline client for timeline ancestor detach");
|
||||
|
||||
let PreparedTimelineDetach { layers } = prepared;
|
||||
|
||||
let ancestor = detached
|
||||
.get_ancestor_timeline()
|
||||
.expect("must still have a ancestor");
|
||||
let ancestor_lsn = detached.get_ancestor_lsn();
|
||||
|
||||
// publish the prepared layers before we reparent any of the timelines, so that on restart
|
||||
// reparented timelines find layers. also do the actual detaching.
|
||||
//
|
||||
// if we crash after this operation, we will at least come up having detached a timeline, but
|
||||
// we cannot go back and reparent the timelines which would have been reparented in normal
|
||||
// execution.
|
||||
//
|
||||
// this is not perfect, but it spares us a retry happening after a compaction or gc on restart
|
||||
// which could give us a completely wrong layer combination.
|
||||
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
|
||||
&layers,
|
||||
(ancestor.timeline_id, ancestor_lsn),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
|
||||
// because we are now keeping the slot in progress, it is unlikely that there will be any
|
||||
// timeline deletions during this time. if we raced one, then we'll just ignore it.
|
||||
tenant
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter_map(|tl| {
|
||||
if Arc::ptr_eq(tl, detached) {
|
||||
return None;
|
||||
}
|
||||
|
||||
if !tl.is_active() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
|
||||
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
|
||||
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
|
||||
|
||||
let is_deleting = tl
|
||||
.delete_progress
|
||||
.try_lock()
|
||||
.map(|flow| !flow.is_not_started())
|
||||
.unwrap_or(true);
|
||||
|
||||
if is_same && is_earlier && !is_deleting {
|
||||
Some(tl.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.for_each(|timeline| {
|
||||
// important in this scope: we are holding the Tenant::timelines lock
|
||||
let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
|
||||
let new_parent = detached.timeline_id;
|
||||
|
||||
tasks.spawn(
|
||||
async move {
|
||||
let res = timeline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.expect("reparented has to have remote client because detached has one")
|
||||
.schedule_reparenting_and_wait(&new_parent)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(()) => Some(timeline),
|
||||
Err(e) => {
|
||||
// with the use of tenant slot, we no longer expect these.
|
||||
tracing::warn!("reparenting failed: {e:#}");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(span),
|
||||
);
|
||||
});
|
||||
|
||||
let reparenting_candidates = tasks.len();
|
||||
let mut reparented = Vec::with_capacity(tasks.len());
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
match res {
|
||||
Ok(Some(timeline)) => {
|
||||
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
|
||||
reparented.push(timeline.timeline_id);
|
||||
}
|
||||
Ok(None) => {
|
||||
// let's just ignore this for now. one or all reparented timelines could have
|
||||
// started deletion, and that is fine.
|
||||
}
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => {
|
||||
// ignore; it's better to continue with a single reparenting failing (or even
|
||||
// all of them) in order to get to the goal state.
|
||||
//
|
||||
// these timelines will never be reparentable, but they can be always detached as
|
||||
// separate tree roots.
|
||||
}
|
||||
Err(je) => tracing::error!("unexpected join error: {je:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
if reparenting_candidates != reparented.len() {
|
||||
tracing::info!("failed to reparent some candidates");
|
||||
}
|
||||
|
||||
Ok(reparented)
|
||||
}
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
METADATA_FILE_NAME,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino::Utf8Path;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -20,7 +20,7 @@ use utils::lsn::Lsn;
|
||||
/// Identified files in the timeline directory.
|
||||
pub(super) enum Discovered {
|
||||
/// The only one we care about
|
||||
Layer(LayerFileName, Utf8PathBuf, u64),
|
||||
Layer(LayerFileName, u64),
|
||||
/// Old ephemeral files from previous launches, should be removed
|
||||
Ephemeral(String),
|
||||
/// Old temporary timeline files, unsure what these really are, should be removed
|
||||
@@ -46,7 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
|
||||
let discovered = match LayerFileName::from_str(&file_name) {
|
||||
Ok(file_name) => {
|
||||
let file_size = direntry.metadata()?.len();
|
||||
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
|
||||
Discovered::Layer(file_name, file_size)
|
||||
}
|
||||
Err(_) => {
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
@@ -104,38 +104,26 @@ pub(super) enum DismissedLayer {
|
||||
|
||||
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
|
||||
pub(super) fn reconcile(
|
||||
discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>,
|
||||
discovered: Vec<(LayerFileName, u64)>,
|
||||
index_part: Option<&IndexPart>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
generation: Generation,
|
||||
shard: ShardIndex,
|
||||
) -> Vec<(
|
||||
LayerFileName,
|
||||
Option<Utf8PathBuf>,
|
||||
Result<Decision, DismissedLayer>,
|
||||
)> {
|
||||
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
|
||||
use Decision::*;
|
||||
|
||||
// name => (local_path, local_metadata, remote_metadata)
|
||||
type Collected = HashMap<
|
||||
LayerFileName,
|
||||
(
|
||||
Option<Utf8PathBuf>,
|
||||
Option<LayerFileMetadata>,
|
||||
Option<LayerFileMetadata>,
|
||||
),
|
||||
>;
|
||||
// name => (local, remote)
|
||||
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
|
||||
|
||||
let mut discovered = discovered
|
||||
.into_iter()
|
||||
.map(|(layer_name, local_path, file_size)| {
|
||||
.map(|(name, file_size)| {
|
||||
(
|
||||
layer_name,
|
||||
name,
|
||||
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
|
||||
// it is not in IndexPart, in which case using our current generation makes sense
|
||||
// because it will be uploaded in this generation.
|
||||
(
|
||||
Some(local_path),
|
||||
Some(LayerFileMetadata::new(file_size, generation, shard)),
|
||||
None,
|
||||
),
|
||||
@@ -152,15 +140,15 @@ pub(super) fn reconcile(
|
||||
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
|
||||
.for_each(|(name, metadata)| {
|
||||
if let Some(existing) = discovered.get_mut(name) {
|
||||
existing.2 = Some(metadata);
|
||||
existing.1 = Some(metadata);
|
||||
} else {
|
||||
discovered.insert(name.to_owned(), (None, None, Some(metadata)));
|
||||
discovered.insert(name.to_owned(), (None, Some(metadata)));
|
||||
}
|
||||
});
|
||||
|
||||
discovered
|
||||
.into_iter()
|
||||
.map(|(name, (local_path, local, remote))| {
|
||||
.map(|(name, (local, remote))| {
|
||||
let decision = if name.is_in_future(disk_consistent_lsn) {
|
||||
Err(DismissedLayer::Future { local })
|
||||
} else {
|
||||
@@ -177,7 +165,7 @@ pub(super) fn reconcile(
|
||||
}
|
||||
};
|
||||
|
||||
(name, local_path, decision)
|
||||
(name, decision)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
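The merge above follows a common two-source reconciliation shape: seed a map from the locally discovered layers, then fold the remote index over it, filling in the remote half of existing entries and inserting remote-only entries with an empty local half. A generic, self-contained sketch of that shape follows; it is not part of this change, and string keys with `u64` values stand in for `LayerFileName` and `LayerFileMetadata`.

use std::collections::HashMap;

/// (local metadata, remote metadata); either side may be missing.
type Merged = HashMap<String, (Option<u64>, Option<u64>)>;

fn merge(local: Vec<(String, u64)>, remote: Vec<(String, u64)>) -> Merged {
    let mut merged: Merged = local
        .into_iter()
        .map(|(name, meta)| (name, (Some(meta), None)))
        .collect();

    for (name, meta) in remote {
        if let Some(existing) = merged.get_mut(&name) {
            existing.1 = Some(meta);
        } else {
            merged.insert(name, (None, Some(meta)));
        }
    }

    merged
}

fn main() {
    let merged = merge(
        vec![("a".into(), 1), ("b".into(), 2)],
        vec![("b".into(), 20), ("c".into(), 30)],
    );
    assert_eq!(merged["a"], (Some(1), None));
    assert_eq!(merged["b"], (Some(2), Some(20)));
    assert_eq!(merged["c"], (None, Some(30)));
}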
|
||||
|
||||
@@ -205,24 +205,6 @@ impl LayerManager {
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub(crate) fn rewrite_layers(
|
||||
&mut self,
|
||||
rewrite_layers: &[(Layer, ResidentLayer)],
|
||||
drop_layers: &[Layer],
|
||||
_metrics: &TimelineMetrics,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
|
||||
// TODO: implement rewrites (currently this code path only used for drops)
|
||||
assert!(rewrite_layers.is_empty());
|
||||
|
||||
for l in drop_layers {
|
||||
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when garbage collect has selected the layers to be removed.
|
||||
pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
|
||||
21
poetry.lock
generated
@@ -1001,17 +1001,18 @@ dotenv = ["python-dotenv"]
|
||||
|
||||
[[package]]
|
||||
name = "flask-cors"
|
||||
version = "4.0.1"
|
||||
version = "3.0.10"
|
||||
description = "A Flask extension adding a decorator for CORS support"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"},
|
||||
{file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"},
|
||||
{file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"},
|
||||
{file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Flask = ">=0.9"
|
||||
Six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "frozenlist"
|
||||
@@ -1242,13 +1243,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "jinja2"
|
||||
version = "3.1.4"
|
||||
version = "3.1.3"
|
||||
description = "A very fast and expressive template engine."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
|
||||
{file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
|
||||
{file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
|
||||
{file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2611,13 +2612,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.3"
|
||||
version = "3.0.1"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
|
||||
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
|
||||
{file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
|
||||
{file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2899,4 +2900,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb"
|
||||
content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572"
|
||||
|
||||
@@ -9,6 +9,8 @@ default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
workspace_hack.workspace = true
|
||||
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -40,13 +42,13 @@ hyper.workspace = true
|
||||
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
|
||||
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
|
||||
http-body-util = { version = "0.1" }
|
||||
indexmap.workspace = true
|
||||
ipnet.workspace = true
|
||||
itertools.workspace = true
|
||||
lasso = { workspace = true, features = ["multi-threaded"] }
|
||||
md5.workspace = true
|
||||
measured = { workspace = true, features = ["lasso"] }
|
||||
metrics.workspace = true
|
||||
moka = { version = "0.12.7", features = ["future"] }
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
@@ -101,8 +103,6 @@ postgres-native-tls.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
redis.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
|
||||
@@ -69,8 +69,10 @@ pub enum BackendType<'a, T, D> {
|
||||
Link(MaybeOwned<'a, url::ApiUrl>, D),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[async_trait::async_trait]
|
||||
pub trait TestBackend: Send + Sync + 'static {
|
||||
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
|
||||
async fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
|
||||
fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
|
||||
@@ -153,7 +155,13 @@ pub struct ComputeUserInfo {
|
||||
|
||||
impl ComputeUserInfo {
|
||||
pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
self.options.get_cache_key(&self.endpoint)
|
||||
let id = EndpointIdInt::from(&self.endpoint);
|
||||
let key = EndpointCacheKey::from(id);
|
||||
if self.options.is_empty() {
|
||||
key
|
||||
} else {
|
||||
key.with_options(self.options.to_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -343,7 +351,7 @@ async fn auth_quirks(
|
||||
Err(e) => {
|
||||
if e.is_auth_failed() {
|
||||
// The password could have been changed, so we invalidate the cache.
|
||||
cached_entry.invalidate();
|
||||
cached_entry.invalidate().await;
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
|
||||
@@ -292,7 +292,7 @@ mod tests {
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
|
||||
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
|
||||
assert_eq!(user_info.options.to_string(), "");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -451,8 +451,8 @@ mod tests {
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
|
||||
assert_eq!(
|
||||
user_info.options.get_cache_key("project"),
|
||||
"project endpoint_type:read_write lsn:0/2"
|
||||
user_info.options.to_string(),
|
||||
"endpoint_type:read_write lsn:0/2"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
use proxy::redis::notifications;
|
||||
use proxy::serverless::cancel_set::CancelSet;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
|
||||
@@ -244,12 +243,6 @@ struct SqlOverHttpArgs {
|
||||
/// increase memory used by the pool
|
||||
#[clap(long, default_value_t = 128)]
|
||||
sql_over_http_pool_shards: usize,
|
||||
|
||||
#[clap(long, default_value_t = 10000)]
|
||||
sql_over_http_client_conn_threshold: u64,
|
||||
|
||||
#[clap(long, default_value_t = 64)]
|
||||
sql_over_http_cancel_set_shards: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -349,7 +342,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_publisher = match &redis_notifications_client {
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
args.region.clone(),
|
||||
@@ -606,8 +599,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
|
||||
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
|
||||
},
|
||||
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
|
||||
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
pub mod common;
|
||||
pub mod endpoints;
|
||||
pub mod project_info;
|
||||
mod timed_lru;
|
||||
|
||||
pub use common::{Cache, Cached};
|
||||
pub use timed_lru::TimedLru;
|
||||
|
||||
40
proxy/src/cache/common.rs
vendored
@@ -3,35 +3,28 @@ use std::ops::{Deref, DerefMut};
|
||||
/// A generic trait which exposes types of cache's key and value,
|
||||
/// as well as the notion of cache entry invalidation.
|
||||
/// This is useful for [`Cached`].
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait Cache {
|
||||
/// Entry's key.
|
||||
type Key;
|
||||
|
||||
/// Entry's value.
|
||||
type Value;
|
||||
|
||||
/// Used for entry invalidation.
|
||||
type LookupInfo<Key>;
|
||||
type LookupInfo;
|
||||
|
||||
/// Invalidate an entry using a lookup info.
|
||||
/// We don't have an empty default impl because it's error-prone.
|
||||
fn invalidate(&self, _: &Self::LookupInfo<Self::Key>);
|
||||
async fn invalidate(&self, _: &Self::LookupInfo);
|
||||
}
|
||||
|
||||
impl<C: Cache> Cache for &C {
|
||||
type Key = C::Key;
|
||||
type Value = C::Value;
|
||||
type LookupInfo<Key> = C::LookupInfo<Key>;
|
||||
type LookupInfo = C::LookupInfo;
|
||||
|
||||
fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
|
||||
C::invalidate(self, info)
|
||||
async fn invalidate(&self, info: &Self::LookupInfo) {
|
||||
C::invalidate(self, info).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for convenient entry invalidation.
|
||||
pub struct Cached<C: Cache, V = <C as Cache>::Value> {
|
||||
pub struct Cached<C: Cache, V> {
|
||||
/// Cache + lookup info.
|
||||
pub token: Option<(C, C::LookupInfo<C::Key>)>,
|
||||
pub token: Option<(C, C::LookupInfo)>,
|
||||
|
||||
/// The value itself.
|
||||
pub value: V,
|
||||
@@ -54,9 +47,9 @@ impl<C: Cache, V> Cached<C, V> {
|
||||
}
|
||||
|
||||
/// Drop this entry from a cache if it's still there.
|
||||
pub fn invalidate(self) -> V {
|
||||
pub async fn invalidate(self) -> V {
|
||||
if let Some((cache, info)) = &self.token {
|
||||
cache.invalidate(info);
|
||||
cache.invalidate(info).await;
|
||||
}
|
||||
self.value
|
||||
}
|
||||
@@ -80,3 +73,16 @@ impl<C: Cache, V> DerefMut for Cached<C, V> {
|
||||
&mut self.value
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V, S> Cache for moka::future::Cache<K, V, S>
|
||||
where
|
||||
K: std::hash::Hash + Eq + Send + Sync + 'static,
|
||||
V: Clone + Send + Sync + 'static,
|
||||
S: std::hash::BuildHasher + Clone + Send + Sync + 'static,
|
||||
{
|
||||
type LookupInfo = K;
|
||||
|
||||
async fn invalidate(&self, key: &Self::LookupInfo) {
|
||||
moka::future::Cache::invalidate(self, key).await
|
||||
}
|
||||
}
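With `invalidate` now async, a `moka::future::Cache` can back `Cached` tokens directly through the blanket impl above. A rough, self-contained usage sketch assuming a tokio runtime follows; it is not part of this change, and the capacity and TTL values are arbitrary.

use std::time::Duration;

#[tokio::main]
async fn main() {
    // A moka cache with a capacity bound and per-entry TTL, roughly the shape
    // of what TimedLru used to provide.
    let cache: moka::future::Cache<String, u32> = moka::future::Cache::builder()
        .max_capacity(1024)
        .time_to_live(Duration::from_secs(300))
        .build();

    cache.insert("endpoint-a".to_string(), 42).await;
    assert_eq!(cache.get("endpoint-a").await, Some(42));

    // The blanket `Cache` impl above uses the key as LookupInfo and simply
    // forwards to moka's own async invalidate, called here directly.
    cache.invalidate("endpoint-a").await;
    assert_eq!(cache.get("endpoint-a").await, None);
}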
|
||||
|
||||
11
proxy/src/cache/project_info.rs
vendored
@@ -8,7 +8,6 @@ use std::{
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use rand::{thread_rng, Rng};
|
||||
use smol_str::SmolStr;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{debug, info};
|
||||
@@ -346,13 +345,9 @@ enum LookupType {
|
||||
}
|
||||
|
||||
impl Cache for ProjectInfoCacheImpl {
|
||||
type Key = SmolStr;
|
||||
// Value is not really used here, but we need to specify it.
|
||||
type Value = SmolStr;
|
||||
type LookupInfo = CachedLookupInfo;
|
||||
|
||||
type LookupInfo<Key> = CachedLookupInfo;
|
||||
|
||||
fn invalidate(&self, key: &Self::LookupInfo<SmolStr>) {
|
||||
async fn invalidate(&self, key: &Self::LookupInfo) {
|
||||
match &key.lookup_type {
|
||||
LookupType::RoleSecret(role_name) => {
|
||||
if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
|
||||
@@ -489,7 +484,7 @@ mod tests {
|
||||
assert!(!cached.cached());
|
||||
assert_eq!(cached.value, secret1);
|
||||
|
||||
cached.invalidate(); // Shouldn't do anything.
|
||||
cached.invalidate().await; // Shouldn't do anything.
|
||||
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
|
||||
assert_eq!(cached.value, secret1);
|
||||
|
||||
|
||||
258
proxy/src/cache/timed_lru.rs
vendored
@@ -1,258 +0,0 @@
|
||||
use std::{
|
||||
borrow::Borrow,
|
||||
hash::Hash,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use tracing::debug;
|
||||
|
||||
// This seems to make more sense than `lru` or `cached`:
|
||||
//
|
||||
// * `near/nearcore` ditched `cached` in favor of `lru`
|
||||
// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed).
|
||||
//
|
||||
// * `lru` methods use an obscure `KeyRef` type in their constraints (which is deliberately excluded from docs).
|
||||
// This severely hinders its usage both in terms of creating wrappers and supported key types.
|
||||
//
|
||||
// On the other hand, `hashlink` has good download stats and appears to be maintained.
|
||||
use hashlink::{linked_hash_map::RawEntryMut, LruCache};
|
||||
|
||||
use super::{common::Cached, *};
|
||||
|
||||
/// An implementation of timed LRU cache with fixed capacity.
|
||||
/// Key properties:
|
||||
///
|
||||
/// * Whenever a new entry is inserted, the least recently accessed one is evicted.
|
||||
/// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
|
||||
///
|
||||
/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp.
|
||||
/// If the entry has expired, we remove it from the cache; otherwise we bump the
|
||||
/// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong
|
||||
/// its existence.
|
||||
///
|
||||
/// * There's an API for immediate invalidation (removal) of a cache entry;
|
||||
/// It's useful in case we know for sure that the entry is no longer correct.
|
||||
/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
|
||||
///
|
||||
/// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
|
||||
/// or by a successful lookup (i.e. the entry hasn't expired yet).
|
||||
/// There is no background job to reap the expired records.
|
||||
///
|
||||
/// * It's possible for an entry that has not yet expired to be evicted
|
||||
/// before expired items. That's a bit wasteful, but probably fine in practice.
|
||||
pub struct TimedLru<K, V> {
|
||||
/// Cache's name for tracing.
|
||||
name: &'static str,
|
||||
|
||||
/// The underlying cache implementation.
|
||||
cache: parking_lot::Mutex<LruCache<K, Entry<V>>>,
|
||||
|
||||
/// Default time-to-live of a single entry.
|
||||
ttl: Duration,
|
||||
|
||||
update_ttl_on_retrieval: bool,
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
|
||||
type Key = K;
|
||||
type Value = V;
|
||||
type LookupInfo<Key> = LookupInfo<Key>;
|
||||
|
||||
fn invalidate(&self, info: &Self::LookupInfo<K>) {
|
||||
self.invalidate_raw(info)
|
||||
}
|
||||
}
|
||||
|
||||
struct Entry<T> {
|
||||
created_at: Instant,
|
||||
expires_at: Instant,
|
||||
value: T,
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, V> TimedLru<K, V> {
|
||||
/// Construct a new LRU cache with timed entries.
|
||||
pub fn new(
|
||||
name: &'static str,
|
||||
capacity: usize,
|
||||
ttl: Duration,
|
||||
update_ttl_on_retrieval: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
cache: LruCache::new(capacity).into(),
|
||||
ttl,
|
||||
update_ttl_on_retrieval,
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop an entry from the cache if it's outdated.
|
||||
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
|
||||
fn invalidate_raw(&self, info: &LookupInfo<K>) {
|
||||
let now = Instant::now();
|
||||
|
||||
// Do costly things before taking the lock.
|
||||
let mut cache = self.cache.lock();
|
||||
let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
|
||||
RawEntryMut::Vacant(_) => return,
|
||||
RawEntryMut::Occupied(x) => x,
|
||||
};
|
||||
|
||||
// Remove the entry if it was created prior to lookup timestamp.
|
||||
let entry = raw_entry.get();
|
||||
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
|
||||
let should_remove = created_at <= info.created_at || expires_at <= now;
|
||||
|
||||
if should_remove {
|
||||
raw_entry.remove();
|
||||
}
|
||||
|
||||
drop(cache); // drop lock before logging
|
||||
debug!(
|
||||
created_at = format_args!("{created_at:?}"),
|
||||
expires_at = format_args!("{expires_at:?}"),
|
||||
entry_removed = should_remove,
|
||||
"processed a cache entry invalidation event"
|
||||
);
|
||||
}
|
||||
|
||||
/// Try retrieving an entry by its key, then execute `extract` if it exists.
|
||||
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
|
||||
fn get_raw<Q, R>(&self, key: &Q, extract: impl FnOnce(&K, &Entry<V>) -> R) -> Option<R>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let now = Instant::now();
|
||||
let deadline = now.checked_add(self.ttl).expect("time overflow");
|
||||
|
||||
// Do costly things before taking the lock.
|
||||
let mut cache = self.cache.lock();
|
||||
let mut raw_entry = match cache.raw_entry_mut().from_key(key) {
|
||||
RawEntryMut::Vacant(_) => return None,
|
||||
RawEntryMut::Occupied(x) => x,
|
||||
};
|
||||
|
||||
// Immediately drop the entry if it has expired.
|
||||
let entry = raw_entry.get();
|
||||
if entry.expires_at <= now {
|
||||
raw_entry.remove();
|
||||
return None;
|
||||
}
|
||||
|
||||
let value = extract(raw_entry.key(), entry);
|
||||
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
|
||||
|
||||
// Update the deadline and the entry's position in the LRU list.
|
||||
if self.update_ttl_on_retrieval {
|
||||
raw_entry.get_mut().expires_at = deadline;
|
||||
}
|
||||
raw_entry.to_back();
|
||||
|
||||
drop(cache); // drop lock before logging
|
||||
debug!(
|
||||
created_at = format_args!("{created_at:?}"),
|
||||
old_expires_at = format_args!("{expires_at:?}"),
|
||||
new_expires_at = format_args!("{deadline:?}"),
|
||||
"accessed a cache entry"
|
||||
);
|
||||
|
||||
Some(value)
|
||||
}
|
||||
|
||||
/// Insert an entry to the cache. If an entry with the same key already
|
||||
/// existed, return the previous value and its creation timestamp.
|
||||
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
|
||||
fn insert_raw(&self, key: K, value: V) -> (Instant, Option<V>) {
|
||||
let created_at = Instant::now();
|
||||
let expires_at = created_at.checked_add(self.ttl).expect("time overflow");
|
||||
|
||||
let entry = Entry {
|
||||
created_at,
|
||||
expires_at,
|
||||
value,
|
||||
};
|
||||
|
||||
// Do costly things before taking the lock.
|
||||
let old = self
|
||||
.cache
|
||||
.lock()
|
||||
.insert(key, entry)
|
||||
.map(|entry| entry.value);
|
||||
|
||||
debug!(
|
||||
created_at = format_args!("{created_at:?}"),
|
||||
expires_at = format_args!("{expires_at:?}"),
|
||||
replaced = old.is_some(),
|
||||
"created a cache entry"
|
||||
);
|
||||
|
||||
(created_at, old)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
|
||||
pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
|
||||
let (created_at, old) = self.insert_raw(key.clone(), value.clone());
|
||||
|
||||
let cached = Cached {
|
||||
token: Some((self, LookupInfo { created_at, key })),
|
||||
value,
|
||||
};
|
||||
|
||||
(old, cached)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
|
||||
/// Retrieve a cached entry in convenient wrapper.
|
||||
pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
|
||||
where
|
||||
K: Borrow<Q> + Clone,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
self.get_raw(key, |key, entry| {
|
||||
let info = LookupInfo {
|
||||
created_at: entry.created_at,
|
||||
key: key.clone(),
|
||||
};
|
||||
|
||||
Cached {
|
||||
token: Some((self, info)),
|
||||
value: entry.value.clone(),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Retrieve a cached entry in convenient wrapper, ignoring its TTL.
|
||||
pub fn get_ignoring_ttl<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let mut cache = self.cache.lock();
|
||||
cache
|
||||
.get(key)
|
||||
.map(|entry| Cached::new_uncached(entry.value.clone()))
|
||||
}
|
||||
|
||||
/// Remove an entry from the cache.
|
||||
pub fn remove<Q>(&self, key: &Q) -> Option<V>
|
||||
where
|
||||
K: Borrow<Q> + Clone,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let mut cache = self.cache.lock();
|
||||
cache.remove(key).map(|entry| entry.value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Lookup information for key invalidation.
|
||||
pub struct LookupInfo<K> {
|
||||
/// Time of creation of a cache [`Entry`].
|
||||
/// We use this during invalidation lookups to prevent eviction of a newer
|
||||
/// entry sharing the same key (it might've been inserted by a different
|
||||
/// task after we got the entry we're trying to invalidate now).
|
||||
created_at: Instant,
|
||||
|
||||
/// Search by this key.
|
||||
key: K,
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::{
|
||||
auth::parse_endpoint_param,
|
||||
cancellation::CancelClosure,
|
||||
console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError},
|
||||
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
|
||||
context::RequestMonitoring,
|
||||
error::{ReportableError, UserFacingError},
|
||||
metrics::{Metrics, NumDbConnectionsGuard},
|
||||
@@ -34,9 +34,6 @@ pub enum ConnectionError {
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
WakeComputeError(#[from] WakeComputeError),
|
||||
|
||||
#[error("error acquiring resource permit: {0}")]
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
}
|
||||
|
||||
impl UserFacingError for ConnectionError {
|
||||
@@ -60,9 +57,6 @@ impl UserFacingError for ConnectionError {
|
||||
None => err.to_string(),
|
||||
},
|
||||
WakeComputeError(err) => err.to_string_client(),
|
||||
TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
_ => COULD_NOT_CONNECT.to_owned(),
|
||||
}
|
||||
}
|
||||
@@ -78,7 +72,6 @@ impl ReportableError for ConnectionError {
|
||||
ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
|
||||
ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
console::locks::ApiLocks,
|
||||
rate_limiter::RateBucketInfo,
|
||||
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
serverless::GlobalConnPoolOptions,
|
||||
Host,
|
||||
};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
@@ -56,8 +56,6 @@ pub struct TlsConfig {
|
||||
pub struct HttpConfig {
|
||||
pub request_timeout: tokio::time::Duration,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
pub cancel_set: CancelSet,
|
||||
pub client_conn_threshold: u64,
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
@@ -413,7 +411,7 @@ pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfi
#[derive(Debug)]
pub struct CacheOptions {
    /// Max number of entries.
    pub size: usize,
    pub size: u64,
    /// Entry's time-to-live.
    pub ttl: Duration,
}

@@ -8,11 +8,10 @@ use crate::{
|
||||
backend::{ComputeCredentialKeys, ComputeUserInfo},
|
||||
IpPattern,
|
||||
},
|
||||
cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru},
|
||||
cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached},
|
||||
compute,
|
||||
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
|
||||
context::RequestMonitoring,
|
||||
error::ReportableError,
|
||||
intern::ProjectIdInt,
|
||||
metrics::ApiLockMetrics,
|
||||
scram, EndpointCacheKey,
|
||||
@@ -31,8 +30,6 @@ pub mod errors {
|
||||
};
|
||||
use thiserror::Error;
|
||||
|
||||
use super::ApiLockError;
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
const REQUEST_FAILED: &str = "Console request failed";
|
||||
|
||||
@@ -214,8 +211,8 @@ pub mod errors {
|
||||
#[error("Too many connections attempts")]
|
||||
TooManyConnections,
|
||||
|
||||
#[error("error acquiring resource permit: {0}")]
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
#[error("Timeout waiting to acquire wake compute lock")]
|
||||
TimeoutError,
|
||||
}
|
||||
|
||||
// This allows more useful interactions than `#[from]`.
|
||||
@@ -225,6 +222,17 @@ pub mod errors {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<tokio::sync::AcquireError> for WakeComputeError {
|
||||
fn from(_: tokio::sync::AcquireError) -> Self {
|
||||
WakeComputeError::TimeoutError
|
||||
}
|
||||
}
|
||||
impl From<tokio::time::error::Elapsed> for WakeComputeError {
|
||||
fn from(_: tokio::time::error::Elapsed) -> Self {
|
||||
WakeComputeError::TimeoutError
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for WakeComputeError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use WakeComputeError::*;
|
||||
@@ -237,9 +245,7 @@ pub mod errors {
|
||||
|
||||
TooManyConnections => self.to_string(),
|
||||
|
||||
TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -250,7 +256,7 @@ pub mod errors {
|
||||
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
|
||||
WakeComputeError::ApiError(e) => e.get_error_kind(),
|
||||
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -320,8 +326,8 @@ impl NodeInfo {
    }
}

pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
pub type NodeInfoCache = moka::future::Cache<EndpointCacheKey, NodeInfo>;
pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;

@@ -406,7 +412,7 @@ impl Api for ConsoleBackend {
            #[cfg(any(test, feature = "testing"))]
            Postgres(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(test)]
            Test(api) => api.wake_compute(),
            Test(api) => api.wake_compute().await,
        }
    }
}
@@ -428,12 +434,11 @@ impl ApiCaches {
        endpoint_cache_config: EndpointCacheConfig,
    ) -> Self {
        Self {
            node_info: NodeInfoCache::new(
                "node_info_cache",
                wake_compute_cache_config.size,
                wake_compute_cache_config.ttl,
                true,
            ),
            node_info: moka::future::Cache::builder()
                .max_capacity(wake_compute_cache_config.size)
                .time_to_idle(wake_compute_cache_config.ttl)
                .name("node_info_cache")
                .build(),
            project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
            endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
        }
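For readers unfamiliar with moka: the builder chain above is the replacement for the old TimedLru constructor, where `max_capacity` bounds the number of entries and `time_to_idle` expires an entry some time after its last access. A small self-contained sketch of the same pattern; the key/value types and numbers are illustrative, not the proxy's, and it assumes moka 0.12 with the "future" feature plus tokio:

use std::time::Duration;

#[tokio::main]
async fn main() {
    // Bounded async cache whose entries expire 90s after their last access.
    let cache: moka::future::Cache<String, String> = moka::future::Cache::builder()
        .name("example_cache")
        .max_capacity(10_000)
        .time_to_idle(Duration::from_secs(90))
        .build();

    // On the future cache, insert/get are async and must be awaited.
    cache.insert("endpoint".to_owned(), "compute-host:5432".to_owned()).await;
    assert_eq!(
        cache.get("endpoint").await.as_deref(),
        Some("compute-host:5432")
    );
}

Unlike the TimedLru wrapper, moka's `get` returns an owned clone of the value rather than a token-carrying guard, which is presumably why the surrounding hunks wrap the result back into `CachedNodeInfo` by hand.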
@@ -450,23 +455,6 @@ pub struct ApiLocks<K> {
|
||||
metrics: &'static ApiLockMetrics,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ApiLockError {
|
||||
#[error("lock was closed")]
|
||||
AcquireError(#[from] tokio::sync::AcquireError),
|
||||
#[error("permit could not be acquired")]
|
||||
TimeoutError(#[from] tokio::time::error::Elapsed),
|
||||
}
|
||||
|
||||
impl ReportableError for ApiLockError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service,
|
||||
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
pub fn new(
|
||||
name: &'static str,
|
||||
@@ -486,7 +474,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, errors::WakeComputeError> {
|
||||
if self.permits == 0 {
|
||||
return Ok(WakeComputePermit { permit: None });
|
||||
}
|
||||
|
||||
@@ -275,10 +275,13 @@ impl super::Api for Api {
        // for some time (highly depends on the console's scale-to-zero policy);
        // The connection info remains the same during that period of time,
        // which means that we might cache it to reduce the load and latency.
        if let Some(cached) = self.caches.node_info.get(&key) {
            info!(key = &*key, "found cached compute node info");
        if let Some(cached) = self.caches.node_info.get(&key).await {
            info!(key = %key, "found cached compute node info");
            ctx.set_project(cached.aux.clone());
            return Ok(cached);
            return Ok(CachedNodeInfo {
                token: Some((&self.caches.node_info, key)),
                value: cached,
            });
        }

        // check rate limit
@@ -294,10 +297,13 @@ impl super::Api for Api {
        // after getting back a permit - it's possible the cache was filled
        // double check
        if permit.should_check_cache() {
            if let Some(cached) = self.caches.node_info.get(&key) {
                info!(key = &*key, "found cached compute node info");
            if let Some(cached) = self.caches.node_info.get(&key).await {
                info!(key = %key, "found cached compute node info");
                ctx.set_project(cached.aux.clone());
                return Ok(cached);
                return Ok(CachedNodeInfo {
                    token: Some((&self.caches.node_info, key)),
                    value: cached,
                });
            }
        }

@@ -308,12 +314,18 @@ impl super::Api for Api {

        // store the cached node as 'warm'
        node.aux.cold_start_info = ColdStartInfo::WarmCached;
        let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
        cached.aux.cold_start_info = cold_start_info;
        self.caches
            .node_info
            .insert(key.clone(), node.clone())
            .await;
        node.aux.cold_start_info = cold_start_info;

        info!(key = &*key, "created a cache entry for compute node info");
        info!(key = %key, "created a cache entry for compute node info");

        Ok(cached)
        Ok(CachedNodeInfo {
            token: Some((&self.caches.node_info, key)),
            value: node,
        })
    }
}
|
||||
|
||||
@@ -154,9 +154,6 @@ smol_str_wrapper!(BranchId);
|
||||
// 90% of project strings are 23 characters or less.
|
||||
smol_str_wrapper!(ProjectId);
|
||||
|
||||
// will usually equal endpoint ID
|
||||
smol_str_wrapper!(EndpointCacheKey);
|
||||
|
||||
smol_str_wrapper!(DbName);
|
||||
|
||||
// postgres hostname, will likely be a port:ip addr
|
||||
@@ -180,3 +177,35 @@ impl EndpointId {
|
||||
ProjectId(self.0.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Hash, PartialEq, Eq, Debug, Clone)]
|
||||
pub struct EndpointCacheKey {
|
||||
endpoint: intern::EndpointIdInt,
|
||||
options: Option<String>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EndpointCacheKey {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.endpoint.as_str())?;
|
||||
if let Some(options) = &self.options {
|
||||
f.write_str(" ")?;
|
||||
f.write_str(options)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<intern::EndpointIdInt> for EndpointCacheKey {
|
||||
fn from(value: intern::EndpointIdInt) -> Self {
|
||||
Self {
|
||||
endpoint: value,
|
||||
options: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl EndpointCacheKey {
|
||||
pub fn with_options(mut self, options: String) -> Self {
|
||||
self.options = Some(options);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,6 @@ use crate::{
|
||||
protocol2::read_proxy_protocol,
|
||||
proxy::handshake::{handshake, HandshakeData},
|
||||
stream::{PqStream, Stream},
|
||||
EndpointCacheKey,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
@@ -391,13 +390,8 @@ impl NeonOptions {
|
||||
Self(options)
|
||||
}
|
||||
|
||||
pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
|
||||
// prefix + format!(" {k}:{v}")
|
||||
// kinda jank because SmolStr is immutable
|
||||
std::iter::once(prefix)
|
||||
.chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
|
||||
.collect::<SmolStr>()
|
||||
.into()
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
/// <https://swagger.io/docs/specification/serialization/> DeepObject format
|
||||
@@ -418,3 +412,20 @@ pub fn neon_option(bytes: &str) -> Option<(&str, &str)> {
|
||||
let (_, [k, v]) = cap.extract();
|
||||
Some((k, v))
|
||||
}
|
||||
|
||||
impl std::fmt::Display for NeonOptions {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let mut space = false;
|
||||
for (k, v) in &self.0 {
|
||||
if space {
|
||||
f.write_str(" ")?;
|
||||
} else {
|
||||
space = true;
|
||||
}
|
||||
f.write_str(k)?;
|
||||
f.write_str(":")?;
|
||||
f.write_str(v)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
|
||||
/// (e.g. the compute node's address might've changed at the wrong time).
|
||||
/// Invalidate the cache entry (if any) to prevent subsequent errors.
|
||||
#[tracing::instrument(name = "invalidate_cache", skip_all)]
|
||||
pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
pub async fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
let is_cached = node_info.cached();
|
||||
if is_cached {
|
||||
warn!("invalidating stalled compute node info cache entry");
|
||||
@@ -34,7 +34,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
};
|
||||
Metrics::get().proxy.connection_failures_total.inc(label);
|
||||
|
||||
node_info.invalidate()
|
||||
node_info.invalidate().await
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -156,7 +156,7 @@ where
|
||||
} else {
|
||||
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
|
||||
info!("compute node's state has likely changed; requesting a wake-up");
|
||||
let old_node_info = invalidate_cache(node_info);
|
||||
let old_node_info = invalidate_cache(node_info).await;
|
||||
let mut node_info =
|
||||
wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
|
||||
node_info.reuse_settings(old_node_info);
|
||||
|
||||
@@ -86,8 +86,6 @@ impl ShouldRetry for compute::ConnectionError {
|
||||
match self {
|
||||
compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
|
||||
compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
|
||||
// the cache entry was not checked for validity
|
||||
compute::ConnectionError::TooManyConnectionAttempts(_) => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,9 @@ use crate::console::messages::MetricsAuxInfo;
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::proxy::retry::retry_after;
|
||||
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
|
||||
use crate::{http, sasl, scram, BranchId, EndpointCacheKey, EndpointId, ProjectId};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
use rstest::rstest;
|
||||
@@ -405,12 +406,13 @@ impl TestConnectMechanism {
|
||||
Self {
|
||||
counter: Arc::new(std::sync::Mutex::new(0)),
|
||||
sequence,
|
||||
cache: Box::leak(Box::new(NodeInfoCache::new(
|
||||
"test",
|
||||
1,
|
||||
Duration::from_secs(100),
|
||||
false,
|
||||
))),
|
||||
cache: Box::leak(Box::new(
|
||||
NodeInfoCache::builder()
|
||||
.name("test")
|
||||
.max_capacity(1)
|
||||
.time_to_live(Duration::from_secs(100))
|
||||
.build(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -476,13 +478,17 @@ impl ConnectMechanism for TestConnectMechanism {
    fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
}

#[async_trait]
impl TestBackend for TestConnectMechanism {
    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
        let mut counter = self.counter.lock().unwrap();
        let action = self.sequence[*counter];
        *counter += 1;
    async fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
        let action = {
            let mut counter = self.counter.lock().unwrap();
            let action = self.sequence[*counter];
            *counter += 1;
            action
        };
        match action {
            ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
            ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache).await),
            ConnectAction::WakeFail => {
                let err = console::errors::ApiError::Console {
                    status: http::StatusCode::FORBIDDEN,
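The extra block around the counter in the new async `wake_compute` above is likely there so the `std::sync::Mutex` guard is dropped before the `.await` points that follow: a guard held across an await keeps a blocking lock while the task is suspended and makes the future non-`Send`. A minimal illustration of the pattern, with hypothetical names:

use std::sync::{Arc, Mutex};

// Hypothetical async step: read/update a value under a std Mutex, then await.
async fn next_step(counter: Arc<Mutex<usize>>) -> usize {
    // Scope the guard so it is dropped here, before any await point...
    let current = {
        let mut n = counter.lock().unwrap();
        *n += 1;
        *n
    };
    // ...so the lock is never held while this task is suspended.
    tokio::task::yield_now().await;
    current
}

#[tokio::main]
async fn main() {
    let counter = Arc::new(Mutex::new(0));
    assert_eq!(next_step(counter.clone()).await, 1);
    assert_eq!(next_step(counter).await, 2);
}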
@@ -514,7 +520,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||
async fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||
let node = NodeInfo {
|
||||
config: compute::ConnCfg::new(),
|
||||
aux: MetricsAuxInfo {
|
||||
@@ -525,8 +531,14 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
|
||||
},
|
||||
allow_self_signed_compute: false,
|
||||
};
|
||||
let (_, node) = cache.insert("key".into(), node);
|
||||
node
|
||||
let ep: EndpointId = "key".into();
|
||||
let ep = EndpointIdInt::from(ep);
|
||||
let key = EndpointCacheKey::from(ep);
|
||||
cache.insert(key.clone(), node.clone()).await;
|
||||
CachedNodeInfo {
|
||||
token: Some((cache, key)),
|
||||
value: node,
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_connect_info(
|
||||
|
||||
@@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
WakeupFailureKind::ApiConsoleOtherError
|
||||
}
|
||||
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
|
||||
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
|
||||
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
|
||||
};
|
||||
Metrics::get()
|
||||
.proxy
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//! Handles both SQL over HTTP and SQL over Websockets.
|
||||
|
||||
mod backend;
|
||||
pub mod cancel_set;
|
||||
mod conn_pool;
|
||||
mod http_util;
|
||||
mod json;
|
||||
@@ -110,37 +109,20 @@ pub async fn task_main(
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let http_conn_span = tracing::info_span!("http_conn", ?conn_id);
|
||||
|
||||
let n_connections = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.sample(crate::metrics::Protocol::Http);
|
||||
tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check");
|
||||
if n_connections > config.http_config.client_conn_threshold {
|
||||
tracing::trace!("attempting to cancel a random connection");
|
||||
if let Some(token) = config.http_config.cancel_set.take() {
|
||||
tracing::debug!("cancelling a random connection");
|
||||
token.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
let conn_token = cancellation_token.child_token();
|
||||
let conn = connection_handler(
|
||||
config,
|
||||
backend.clone(),
|
||||
connections.clone(),
|
||||
cancellation_handler.clone(),
|
||||
conn_token.clone(),
|
||||
server.clone(),
|
||||
tls_acceptor.clone(),
|
||||
conn,
|
||||
peer_addr,
|
||||
)
|
||||
.instrument(http_conn_span);
|
||||
|
||||
connections.spawn(async move {
|
||||
let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token);
|
||||
conn.await
|
||||
});
|
||||
connections.spawn(
|
||||
connection_handler(
|
||||
config,
|
||||
backend.clone(),
|
||||
connections.clone(),
|
||||
cancellation_handler.clone(),
|
||||
cancellation_token.clone(),
|
||||
server.clone(),
|
||||
tls_acceptor.clone(),
|
||||
conn,
|
||||
peer_addr,
|
||||
)
|
||||
.instrument(http_conn_span),
|
||||
);
|
||||
}
|
||||
|
||||
connections.wait().await;
|
||||
@@ -261,7 +243,6 @@ async fn connection_handler(
|
||||
// On cancellation, trigger the HTTP connection handler to shut down.
|
||||
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
|
||||
Either::Left((_cancelled, mut conn)) => {
|
||||
tracing::debug!(%peer_addr, "cancelling connection");
|
||||
conn.as_mut().graceful_shutdown();
|
||||
conn.await
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@ use crate::{
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
locks::ApiLocks,
|
||||
provider::ApiLockError,
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
@@ -132,8 +131,6 @@ pub enum HttpConnError {
|
||||
AuthError(#[from] AuthError),
|
||||
#[error("wake_compute returned error")]
|
||||
WakeCompute(#[from] WakeComputeError),
|
||||
#[error("error acquiring resource permit: {0}")]
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
}
|
||||
|
||||
impl ReportableError for HttpConnError {
|
||||
@@ -144,7 +141,6 @@ impl ReportableError for HttpConnError {
|
||||
HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
|
||||
HttpConnError::AuthError(a) => a.get_error_kind(),
|
||||
HttpConnError::WakeCompute(w) => w.get_error_kind(),
|
||||
HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -157,9 +153,6 @@ impl UserFacingError for HttpConnError {
|
||||
HttpConnError::GetAuthInfo(c) => c.to_string_client(),
|
||||
HttpConnError::AuthError(c) => c.to_string_client(),
|
||||
HttpConnError::WakeCompute(c) => c.to_string_client(),
|
||||
HttpConnError::TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -172,15 +165,6 @@ impl ShouldRetry for HttpConnError {
|
||||
HttpConnError::GetAuthInfo(_) => false,
|
||||
HttpConnError::AuthError(_) => false,
|
||||
HttpConnError::WakeCompute(_) => false,
|
||||
HttpConnError::TooManyConnectionAttempts(_) => false,
|
||||
}
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
match self {
|
||||
HttpConnError::ConnectionError(e) => e.should_retry_database_address(),
|
||||
// we never checked cache validity
|
||||
HttpConnError::TooManyConnectionAttempts(_) => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
//! A set for cancelling random http connections
|
||||
|
||||
use std::{
|
||||
hash::{BuildHasher, BuildHasherDefault},
|
||||
num::NonZeroUsize,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use indexmap::IndexMap;
|
||||
use parking_lot::Mutex;
|
||||
use rand::{thread_rng, Rng};
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use uuid::Uuid;
|
||||
|
||||
type Hasher = BuildHasherDefault<FxHasher>;
|
||||
|
||||
pub struct CancelSet {
|
||||
shards: Box<[Mutex<CancelShard>]>,
|
||||
// keyed by random uuid, fxhasher is fine
|
||||
hasher: Hasher,
|
||||
}
|
||||
|
||||
pub struct CancelShard {
|
||||
tokens: IndexMap<uuid::Uuid, (Instant, CancellationToken), Hasher>,
|
||||
}
|
||||
|
||||
impl CancelSet {
|
||||
pub fn new(shards: usize) -> Self {
|
||||
CancelSet {
|
||||
shards: (0..shards)
|
||||
.map(|_| {
|
||||
Mutex::new(CancelShard {
|
||||
tokens: IndexMap::with_hasher(Hasher::default()),
|
||||
})
|
||||
})
|
||||
.collect(),
|
||||
hasher: Hasher::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn take(&self) -> Option<CancellationToken> {
|
||||
for _ in 0..4 {
|
||||
if let Some(token) = self.take_raw(thread_rng().gen()) {
|
||||
return Some(token);
|
||||
}
|
||||
tracing::trace!("failed to get cancel token");
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn take_raw(&self, rng: usize) -> Option<CancellationToken> {
|
||||
NonZeroUsize::new(self.shards.len())
|
||||
.and_then(|len| self.shards[rng % len].lock().take(rng / len))
|
||||
}
|
||||
|
||||
pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> {
|
||||
let shard = NonZeroUsize::new(self.shards.len()).map(|len| {
|
||||
let hash = self.hasher.hash_one(id) as usize;
|
||||
let shard = &self.shards[hash % len];
|
||||
shard.lock().insert(id, token);
|
||||
shard
|
||||
});
|
||||
CancelGuard { shard, id }
|
||||
}
|
||||
}
|
||||
|
||||
impl CancelShard {
|
||||
fn take(&mut self, rng: usize) -> Option<CancellationToken> {
|
||||
NonZeroUsize::new(self.tokens.len()).and_then(|len| {
|
||||
// 10 second grace period so we don't cancel new connections
|
||||
if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?;
|
||||
Some(token)
|
||||
})
|
||||
}
|
||||
|
||||
fn remove(&mut self, id: uuid::Uuid) {
|
||||
self.tokens.swap_remove(&id);
|
||||
}
|
||||
|
||||
fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) {
|
||||
self.tokens.insert(id, (Instant::now(), token));
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CancelGuard<'a> {
|
||||
shard: Option<&'a Mutex<CancelShard>>,
|
||||
id: Uuid,
|
||||
}
|
||||
|
||||
impl Drop for CancelGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some(shard) = self.shard {
|
||||
shard.lock().remove(self.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -58,10 +58,7 @@ impl fmt::Display for ConnInfo {
|
||||
write!(
|
||||
f,
|
||||
"{}@{}/{}?{}",
|
||||
self.user_info.user,
|
||||
self.user_info.endpoint,
|
||||
self.dbname,
|
||||
self.user_info.options.get_cache_key("")
|
||||
self.user_info.user, self.user_info.endpoint, self.dbname, self.user_info.options
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -716,7 +713,7 @@ impl<C: ClientInnerExt> Drop for Client<C> {
|
||||
mod tests {
|
||||
use std::{mem, sync::atomic::AtomicBool};
|
||||
|
||||
use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId};
|
||||
use crate::{BranchId, EndpointId, ProjectId};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -767,8 +764,6 @@ mod tests {
|
||||
max_total_conns: 3,
|
||||
},
|
||||
request_timeout: Duration::from_secs(1),
|
||||
cancel_set: CancelSet::new(0),
|
||||
client_conn_threshold: u64::MAX,
|
||||
}));
|
||||
let pool = GlobalConnPool::new(config);
|
||||
let conn_info = ConnInfo {
|
||||
|
||||
@@ -424,8 +424,8 @@ pub enum SqlOverHttpCancel {
|
||||
impl ReportableError for SqlOverHttpCancel {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect,
|
||||
SqlOverHttpCancel::Connect => ErrorKind::ClientDisconnect,
|
||||
SqlOverHttpCancel::Postgres => ErrorKind::RateLimit,
|
||||
SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ requests = "^2.31.0"
|
||||
pytest-xdist = "^3.3.1"
|
||||
asyncpg = "^0.29.0"
|
||||
aiopg = "^1.4.0"
|
||||
Jinja2 = "^3.1.4"
|
||||
Jinja2 = "^3.1.3"
|
||||
types-requests = "^2.31.0.0"
|
||||
types-psycopg2 = "^2.9.21.10"
|
||||
boto3 = "^1.34.11"
|
||||
@@ -24,7 +24,7 @@ backoff = "^2.2.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^3.0.3"
|
||||
Werkzeug = "^3.0.1"
|
||||
pytest-order = "^1.1.0"
|
||||
allure-pytest = "^2.13.2"
|
||||
pytest-asyncio = "^0.21.0"
|
||||
|
||||
@@ -14,7 +14,7 @@ import textwrap
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import ExitStack, closing, contextmanager
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
@@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import (
|
||||
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name
|
||||
from fixtures.pageserver.types import IndexPartDump
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
@@ -68,7 +68,7 @@ from fixtures.remote_storage import (
|
||||
RemoteStorageUser,
|
||||
S3Storage,
|
||||
default_remote_storage,
|
||||
remote_storage_to_toml_dict,
|
||||
remote_storage_to_toml_inline_table,
|
||||
)
|
||||
from fixtures.safekeeper.http import SafekeeperHttpClient
|
||||
from fixtures.safekeeper.utils import are_walreceivers_absent
|
||||
@@ -82,7 +82,6 @@ from fixtures.utils import (
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
@@ -466,7 +465,6 @@ class NeonEnvBuilder:
|
||||
initial_tenant: Optional[TenantId] = None,
|
||||
initial_timeline: Optional[TimelineId] = None,
|
||||
pageserver_virtual_file_io_engine: Optional[str] = None,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -490,7 +488,6 @@ class NeonEnvBuilder:
|
||||
self.env: Optional[NeonEnv] = None
|
||||
self.keep_remote_storage_contents: bool = True
|
||||
self.neon_binpath = neon_binpath
|
||||
self.neon_local_binpath = neon_binpath
|
||||
self.pg_distrib_dir = pg_distrib_dir
|
||||
self.pg_version = pg_version
|
||||
self.preserve_database_files = preserve_database_files
|
||||
@@ -522,8 +519,6 @@ class NeonEnvBuilder:
|
||||
self.pageserver_validate_vectored_get = bool(validate)
|
||||
log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"')
|
||||
|
||||
self.pageserver_aux_file_policy = pageserver_aux_file_policy
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
), "Unexpectedly instantiated from outside a test function"
|
||||
@@ -569,7 +564,6 @@ class NeonEnvBuilder:
|
||||
timeline_id=env.initial_timeline,
|
||||
shard_count=initial_tenant_shard_count,
|
||||
shard_stripe_size=initial_tenant_shard_stripe_size,
|
||||
aux_file_v2=self.pageserver_aux_file_policy,
|
||||
)
|
||||
assert env.initial_tenant == initial_tenant
|
||||
assert env.initial_timeline == initial_timeline
|
||||
@@ -638,11 +632,17 @@ class NeonEnvBuilder:
|
||||
def from_repo_dir(
|
||||
self,
|
||||
repo_dir: Path,
|
||||
neon_binpath: Optional[Path] = None,
|
||||
pg_distrib_dir: Optional[Path] = None,
|
||||
) -> NeonEnv:
|
||||
"""
|
||||
A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir.
|
||||
"""
|
||||
|
||||
# Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests
|
||||
self.neon_binpath = neon_binpath or self.neon_binpath
|
||||
self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir
|
||||
|
||||
# Get the initial tenant and timeline from the snapshot config
|
||||
snapshot_config_toml = repo_dir / "config"
|
||||
with snapshot_config_toml.open("r") as f:
|
||||
@@ -982,7 +982,7 @@ class NeonEnv:
|
||||
|
||||
Some notable functions and fields in NeonEnv:
|
||||
|
||||
endpoints - A factory object for creating postgres compute nodes.
|
||||
postgres - A factory object for creating postgres compute nodes.
|
||||
|
||||
pageservers - An array containing objects representing the pageservers
|
||||
|
||||
@@ -1017,10 +1017,9 @@ class NeonEnv:
|
||||
self.pg_version = config.pg_version
|
||||
# Binary path for pageserver, safekeeper, etc
|
||||
self.neon_binpath = config.neon_binpath
|
||||
# Binary path for neon_local test-specific binaries
|
||||
self.neon_local_binpath = config.neon_local_binpath
|
||||
if self.neon_local_binpath is None:
|
||||
self.neon_local_binpath = self.neon_binpath
|
||||
# Binary path for neon_local test-specific binaries: may be overridden
|
||||
# after construction for compat testing
|
||||
self.neon_local_binpath = config.neon_binpath
|
||||
self.pg_distrib_dir = config.pg_distrib_dir
|
||||
self.endpoint_counter = 0
|
||||
self.storage_controller_config = config.storage_controller_config
|
||||
@@ -1052,7 +1051,6 @@ class NeonEnv:
|
||||
)
|
||||
|
||||
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
|
||||
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
|
||||
|
||||
# Create a config file corresponding to the options
|
||||
cfg: Dict[str, Any] = {
|
||||
@@ -1289,7 +1287,6 @@ def _shared_simple_env(
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
) -> Iterator[NeonEnv]:
|
||||
"""
|
||||
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
|
||||
@@ -1320,7 +1317,6 @@ def _shared_simple_env(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1360,7 +1356,6 @@ def neon_env_builder(
|
||||
test_overlay_dir: Path,
|
||||
top_output_dir: Path,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1394,7 +1389,6 @@ def neon_env_builder(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
test_overlay_dir=test_overlay_dir,
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
) as builder:
|
||||
yield builder
|
||||
|
||||
@@ -1554,7 +1548,6 @@ class NeonCli(AbstractNeonCli):
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
placement_policy: Optional[str] = None,
|
||||
set_default: bool = False,
|
||||
aux_file_v2: Optional[AuxFileStore] = None,
|
||||
) -> Tuple[TenantId, TimelineId]:
|
||||
"""
|
||||
Creates a new tenant, returns its id and its initial timeline's id.
|
||||
@@ -1578,16 +1571,6 @@ class NeonCli(AbstractNeonCli):
|
||||
product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
|
||||
)
|
||||
)
|
||||
|
||||
if aux_file_v2 is AuxFileStore.V2:
|
||||
args.extend(["-c", "switch_aux_file_policy:v2"])
|
||||
|
||||
if aux_file_v2 is AuxFileStore.V1:
|
||||
args.extend(["-c", "switch_aux_file_policy:v1"])
|
||||
|
||||
if aux_file_v2 is AuxFileStore.CrossValidation:
|
||||
args.extend(["-c", "switch_aux_file_policy:cross_validation"])
|
||||
|
||||
if set_default:
|
||||
args.append("--set-default")
|
||||
|
||||
@@ -1726,44 +1709,36 @@ class NeonCli(AbstractNeonCli):
|
||||
force: Optional[str] = None,
|
||||
pageserver_config_override: Optional[str] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
remote_storage = self.env.pageserver_remote_storage
|
||||
with tempfile.NamedTemporaryFile(mode="w+") as tmp:
|
||||
tmp.write(toml.dumps(config))
|
||||
tmp.flush()
|
||||
|
||||
ps_config = {}
|
||||
if remote_storage is not None:
|
||||
ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage)
|
||||
|
||||
if pageserver_config_override is not None:
|
||||
for o in pageserver_config_override.split(";"):
|
||||
override = toml.loads(o)
|
||||
for key, value in override.items():
|
||||
ps_config[key] = value
|
||||
|
||||
with ExitStack() as stack:
|
||||
ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+"))
|
||||
ps_config_file.write(toml.dumps(ps_config))
|
||||
ps_config_file.flush()
|
||||
|
||||
neon_local_config = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+"))
|
||||
neon_local_config.write(toml.dumps(config))
|
||||
neon_local_config.flush()
|
||||
|
||||
cmd = [
|
||||
"init",
|
||||
f"--config={neon_local_config.name}",
|
||||
"--pg-version",
|
||||
self.env.pg_version,
|
||||
f"--pageserver-config={ps_config_file.name}",
|
||||
]
|
||||
cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
|
||||
|
||||
if force is not None:
|
||||
cmd.extend(["--force", force])
|
||||
|
||||
remote_storage = self.env.pageserver_remote_storage
|
||||
|
||||
if remote_storage is not None:
|
||||
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
|
||||
|
||||
cmd.append(
|
||||
f"--pageserver-config-override=remote_storage={remote_storage_toml_table}"
|
||||
)
|
||||
|
||||
if pageserver_config_override is not None:
|
||||
cmd += [
|
||||
f"--pageserver-config-override={o.strip()}"
|
||||
for o in pageserver_config_override.split(";")
|
||||
]
|
||||
|
||||
s3_env_vars = None
|
||||
if isinstance(remote_storage, S3Storage):
|
||||
s3_env_vars = remote_storage.access_env_vars()
|
||||
res = self.raw_cli(cmd, extra_env_vars=s3_env_vars)
|
||||
res.check_returncode()
|
||||
return res
|
||||
return res
|
||||
|
||||
def storage_controller_start(self):
|
||||
cmd = ["storage_controller", "start"]
|
||||
@@ -1778,9 +1753,10 @@ class NeonCli(AbstractNeonCli):
|
||||
def pageserver_start(
|
||||
self,
|
||||
id: int,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
start_args = ["pageserver", "start", f"--id={id}"]
|
||||
start_args = ["pageserver", "start", f"--id={id}", *overrides]
|
||||
storage = self.env.pageserver_remote_storage
|
||||
|
||||
if isinstance(storage, S3Storage):
|
||||
@@ -2445,42 +2421,9 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
return self.workdir / "tenants"
|
||||
return self.workdir / "tenants" / str(tenant_shard_id)
|
||||
|
||||
@property
|
||||
def config_toml_path(self) -> Path:
|
||||
return self.workdir / "pageserver.toml"
|
||||
|
||||
def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]):
|
||||
"""
|
||||
Edit the pageserver's config toml file in place.
|
||||
"""
|
||||
path = self.config_toml_path
|
||||
with open(path, "r") as f:
|
||||
config = toml.load(f)
|
||||
edit_fn(config)
|
||||
with open(path, "w") as f:
|
||||
toml.dump(config, f)
|
||||
|
||||
def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`.
|
||||
Returns the replaced values.
|
||||
If there was no previous value, the key is mapped to None.
|
||||
This allows to restore the original value by calling this method with the returned dict.
|
||||
"""
|
||||
replacements = {}
|
||||
|
||||
def doit(config: Dict[str, Any]):
|
||||
while len(patch) > 0:
|
||||
key, new = patch.popitem()
|
||||
old = config.get(key, None)
|
||||
config[key] = new
|
||||
replacements[key] = old
|
||||
|
||||
self.edit_config_toml(doit)
|
||||
return replacements
|
||||
|
||||
def start(
|
||||
self,
|
||||
overrides: Tuple[str, ...] = (),
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
) -> "NeonPageserver":
|
||||
"""
|
||||
@@ -2490,7 +2433,9 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
"""
|
||||
assert self.running is False
|
||||
|
||||
self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
|
||||
self.env.neon_cli.pageserver_start(
|
||||
self.id, overrides=overrides, extra_env_vars=extra_env_vars
|
||||
)
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
@@ -2652,37 +2597,6 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
)
|
||||
|
||||
def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
|
||||
"""
|
||||
Inspect local storage on a pageserver to discover which layer files are present.
|
||||
|
||||
:return: list of relative paths to layers, from the timeline root.
|
||||
"""
|
||||
timeline_path = self.timeline_dir(tenant_id, timeline_id)
|
||||
|
||||
def relative(p: Path) -> Path:
|
||||
return p.relative_to(timeline_path)
|
||||
|
||||
return sorted(
|
||||
list(
|
||||
map(
|
||||
relative,
|
||||
filter(
|
||||
lambda path: path.name != "metadata"
|
||||
and "ephemeral" not in path.name
|
||||
and "temp" not in path.name,
|
||||
timeline_path.glob("*"),
|
||||
),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
def layer_exists(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName
|
||||
) -> bool:
|
||||
layers = self.list_layers(tenant_id, timeline_id)
|
||||
return layer_name in [parse_layer_file_name(p.name) for p in layers]
|
||||
|
||||
|
||||
class PgBin:
|
||||
"""A helper class for executing postgres binaries"""
|
||||
|
||||
@@ -819,23 +819,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
continue
|
||||
self.download_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
def detach_ancestor(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
batch_size: int | None = None,
|
||||
) -> Set[TimelineId]:
|
||||
params = {}
|
||||
if batch_size is not None:
|
||||
params["batch_size"] = batch_size
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor",
|
||||
params=params,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
json = res.json()
|
||||
return set(map(TimelineId, json["reparented_timelines"]))
|
||||
|
||||
def evict_layer(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
|
||||
):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Tuple, Union
|
||||
|
||||
@@ -48,36 +47,46 @@ class InvalidFileName(Exception):
|
||||
pass
|
||||
|
||||
|
||||
IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$")
|
||||
|
||||
|
||||
def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
|
||||
"""Parse an image layer file name. Return key start, key end, and snapshot lsn"""
|
||||
|
||||
match = IMAGE_LAYER_FILE_NAME.match(f_name)
|
||||
if match is None:
|
||||
raise InvalidFileName(f"'{f_name}' is not an image layer filename")
|
||||
|
||||
return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16)
|
||||
|
||||
|
||||
DELTA_LAYER_FILE_NAME = re.compile(
|
||||
"^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$"
|
||||
)
|
||||
parts = f_name.split("__")
|
||||
if len(parts) != 2:
|
||||
raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
|
||||
key_parts = parts[0].split("-")
|
||||
if len(key_parts) != 2:
|
||||
raise InvalidFileName(
|
||||
f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
|
||||
)
|
||||
try:
|
||||
return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16)
|
||||
except ValueError as e:
|
||||
raise InvalidFileName(f"conversion error: {f_name}") from e
|
||||
|
||||
|
||||
def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
|
||||
"""Parse a delta layer file name. Return key start, key end, lsn start, and lsn end"""
|
||||
match = DELTA_LAYER_FILE_NAME.match(f_name)
|
||||
if match is None:
|
||||
raise InvalidFileName(f"'{f_name}' is not an delta layer filename")
|
||||
|
||||
return (
|
||||
int(match.group(1), 16),
|
||||
int(match.group(2), 16),
|
||||
int(match.group(3), 16),
|
||||
int(match.group(4), 16),
|
||||
)
|
||||
parts = f_name.split("__")
|
||||
if len(parts) != 2:
|
||||
raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
|
||||
key_parts = parts[0].split("-")
|
||||
if len(key_parts) != 2:
|
||||
raise InvalidFileName(
|
||||
f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
|
||||
)
|
||||
lsn_parts = parts[1].split("-")
|
||||
if len(lsn_parts) != 2:
|
||||
raise InvalidFileName(
|
||||
f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}"
|
||||
)
|
||||
try:
|
||||
return (
|
||||
int(key_parts[0], 16),
|
||||
int(key_parts[1], 16),
|
||||
int(lsn_parts[0], 16),
|
||||
int(lsn_parts[1], 16),
|
||||
)
|
||||
except ValueError as e:
|
||||
raise InvalidFileName(f"conversion error: {f_name}") from e
|
||||
|
||||
|
||||
def parse_layer_file_name(file_name: str) -> LayerFileName:
|
||||
|
||||
@@ -5,7 +5,6 @@ import pytest
|
||||
from _pytest.python import Metafunc
|
||||
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import AuxFileStore
|
||||
|
||||
"""
|
||||
Dynamically parametrize tests by different parameters
|
||||
@@ -32,11 +31,6 @@ def pageserver_virtual_file_io_engine() -> Optional[str]:
|
||||
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
|
||||
return None
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc: Metafunc):
|
||||
if (bt := os.getenv("BUILD_TYPE")) is None:
|
||||
build_types = ["debug", "release"]
|
||||
|
||||
@@ -141,13 +141,11 @@ class LocalFsStorage:
|
||||
with self.heatmap_path(tenant_id).open("r") as f:
|
||||
return json.load(f)
|
||||
|
||||
def to_toml_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
def to_toml_inline_table(self) -> str:
|
||||
rv = {
|
||||
"local_path": str(self.root),
|
||||
}
|
||||
|
||||
def to_toml_inline_table(self) -> str:
|
||||
return toml.TomlEncoder().dump_inline_table(self.to_toml_dict())
|
||||
return toml.TomlEncoder().dump_inline_table(rv)
|
||||
|
||||
def cleanup(self):
|
||||
# no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files
|
||||
@@ -196,7 +194,7 @@ class S3Storage:
|
||||
}
|
||||
)
|
||||
|
||||
def to_toml_dict(self) -> Dict[str, Any]:
|
||||
def to_toml_inline_table(self) -> str:
|
||||
rv = {
|
||||
"bucket_name": self.bucket_name,
|
||||
"bucket_region": self.bucket_region,
|
||||
@@ -208,10 +206,7 @@ class S3Storage:
|
||||
if self.endpoint is not None:
|
||||
rv["endpoint"] = self.endpoint
|
||||
|
||||
return rv
|
||||
|
||||
def to_toml_inline_table(self) -> str:
|
||||
return toml.TomlEncoder().dump_inline_table(self.to_toml_dict())
|
||||
return toml.TomlEncoder().dump_inline_table(rv)
|
||||
|
||||
def do_cleanup(self):
|
||||
if not self.cleanup:
|
||||
@@ -419,13 +414,6 @@ def default_remote_storage() -> RemoteStorageKind:
|
||||
return RemoteStorageKind.LOCAL_FS
|
||||
|
||||
|
||||
def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]:
|
||||
if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
|
||||
raise Exception("invalid remote storage type")
|
||||
|
||||
return remote_storage.to_toml_dict()
|
||||
|
||||
|
||||
# serialize as toml inline table
|
||||
def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
|
||||
if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import contextlib
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -485,16 +484,3 @@ def assert_no_errors(log_file, service, allowed_errors):
|
||||
log.info(f"not allowed {service} error: {error.strip()}")
|
||||
|
||||
assert not errors, f"Log errors on {service}: {errors[0]}"
|
||||
|
||||
|
||||
@enum.unique
|
||||
class AuxFileStore(str, enum.Enum):
|
||||
V1 = "V1"
|
||||
V2 = "V2"
|
||||
CrossValidation = "CrossValidation"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"'aux-{self.value}'"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"'aux-{self.value}'"
|
||||
|
||||
@@ -140,14 +140,10 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
|
||||
|
||||
# start without gc so we can time compaction with less noise; use shorter
|
||||
# period for compaction so it starts earlier
|
||||
def patch_default_tenant_config(config):
|
||||
tenant_config = config.get("tenant_config", {})
|
||||
tenant_config["compaction_period"] = "3s"
|
||||
tenant_config["gc_period"] = "0s"
|
||||
config["tenant_config"] = tenant_config
|
||||
|
||||
env.pageserver.edit_config_toml(patch_default_tenant_config)
|
||||
env.pageserver.start(
|
||||
overrides=(
|
||||
"--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }",
|
||||
),
|
||||
# this does print more than we want, but the number should be comparable between runs
|
||||
extra_env_vars={
|
||||
"RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info"
|
||||
|
||||
@@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"trace_read_requests": True,
|
||||
"walreceiver_connect_timeout": "13m",
|
||||
"image_layer_creation_check_threshold": 1,
|
||||
"switch_aux_file_policy": "CrossValidation",
|
||||
"switch_to_aux_file_v2": True,
|
||||
}
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
@@ -233,18 +233,17 @@ def test_forward_compatibility(
|
||||
neon_env_builder.pageserver_validate_vectored_get = None
|
||||
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
# Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.).
|
||||
# But always use the current version's neon_local binary.
|
||||
# This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI.
|
||||
neon_env_builder.neon_binpath = compatibility_neon_bin
|
||||
neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir
|
||||
neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath
|
||||
|
||||
neon_local_binpath = neon_env_builder.neon_binpath
|
||||
env = neon_env_builder.from_repo_dir(
|
||||
compatibility_snapshot_dir / "repo",
|
||||
neon_binpath=compatibility_neon_bin,
|
||||
pg_distrib_dir=compatibility_postgres_distrib_dir,
|
||||
)
|
||||
|
||||
# Use current neon_local even though we're using old binaries for
|
||||
# everything else: our test code is written for latest CLI args.
|
||||
env.neon_local_binpath = neon_local_binpath
|
||||
|
||||
neon_env_builder.start()
|
||||
|
||||
check_neon_works(
|
||||
|
||||
@@ -5,6 +5,7 @@ from dataclasses import dataclass
|
||||
from typing import Any, Dict, Iterable, Tuple
|
||||
|
||||
import pytest
|
||||
import toml
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
@@ -44,16 +45,17 @@ def test_min_resident_size_override_handling(
|
||||
ps_http.set_tenant_config(tenant_id, {})
|
||||
assert_config(tenant_id, None, default_tenant_conf_value)
|
||||
|
||||
if config_level_override is not None:
|
||||
|
||||
def set_min_resident_size(config):
|
||||
tenant_config = config.get("tenant_config", {})
|
||||
tenant_config["min_resident_size_override"] = config_level_override
|
||||
config["tenant_config"] = tenant_config
|
||||
|
||||
env.pageserver.edit_config_toml(set_min_resident_size)
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
if config_level_override is not None:
|
||||
env.pageserver.start(
|
||||
overrides=(
|
||||
"--pageserver-config-override=tenant_config={ min_resident_size_override = "
|
||||
+ str(config_level_override)
|
||||
+ " }",
|
||||
)
|
||||
)
|
||||
else:
|
||||
env.pageserver.start()
|
||||
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
assert_overrides(tenant_id, config_level_override)
|
||||
@@ -162,32 +164,34 @@ class EvictionEnv:
|
||||
usage eviction task is unknown; it might need to run one more iteration
|
||||
before assertions can be made.
|
||||
"""
|
||||
disk_usage_config = {
|
||||
"period": period,
|
||||
"max_usage_pct": max_usage_pct,
|
||||
"min_avail_bytes": min_avail_bytes,
|
||||
"mock_statvfs": mock_behavior,
|
||||
"eviction_order": eviction_order.config(),
|
||||
}
|
||||
|
||||
enc = toml.TomlEncoder()
|
||||
|
||||
# these can sometimes happen during startup before any tenants have been
|
||||
# loaded, so nothing can be evicted, we just wait for next iteration which
|
||||
# is able to evict.
|
||||
pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
|
||||
|
||||
pageserver.patch_config_toml_nonrecursive(
|
||||
{
|
||||
"disk_usage_based_eviction": {
|
||||
"period": period,
|
||||
"max_usage_pct": max_usage_pct,
|
||||
"min_avail_bytes": min_avail_bytes,
|
||||
"mock_statvfs": mock_behavior,
|
||||
"eviction_order": eviction_order.config(),
|
||||
},
|
||||
pageserver.start(
|
||||
overrides=(
|
||||
"--pageserver-config-override=disk_usage_based_eviction="
|
||||
+ enc.dump_inline_table(disk_usage_config).replace("\n", " "),
|
||||
# Disk usage based eviction runs as a background task.
|
||||
# But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup.
|
||||
# But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages.
|
||||
# But, we only have a 10-second-timeout in this test.
|
||||
# So, disable the delay for this test.
|
||||
"background_task_maximum_delay": "0s",
|
||||
}
|
||||
"--pageserver-config-override=background_task_maximum_delay='0s'",
|
||||
),
|
||||
)
|
||||
|
||||
pageserver.start()
|
||||
|
||||
# we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
|
||||
for tenant_id, timeline_id in self.timelines:
|
||||
tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
|
||||
|
||||
@@ -2,7 +2,6 @@ import time
|
||||
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
|
||||
from fixtures.pageserver.types import parse_layer_file_name
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload_queue_empty,
|
||||
@@ -87,7 +86,14 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
|
||||
# path = env.remote_storage.timeline_path(tenant_id, timeline_id)
|
||||
l1_found = None
|
||||
for path in env.pageserver.list_layers(tenant_id, timeline_id):
|
||||
for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir():
|
||||
if path.name == "metadata" or path.name.startswith("ephemeral-"):
|
||||
continue
|
||||
|
||||
if len(path.suffixes) > 0:
|
||||
# temp files
|
||||
continue
|
||||
|
||||
[key_range, lsn_range] = path.name.split("__", maxsplit=1)
|
||||
|
||||
if "-" not in lsn_range:
|
||||
@@ -102,21 +108,19 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
|
||||
if l1_found is not None:
|
||||
raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}")
|
||||
l1_found = parse_layer_file_name(path.name)
|
||||
l1_found = path
|
||||
|
||||
assert l1_found is not None, "failed to find L1 locally"
|
||||
|
||||
uploaded = env.pageserver_remote_storage.remote_layer_path(
|
||||
tenant_id, timeline_id, l1_found.to_str()
|
||||
tenant_id, timeline_id, l1_found.name
|
||||
)
|
||||
assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
|
||||
|
||||
env.pageserver.start()
|
||||
wait_until_tenant_active(pageserver_http, tenant_id)
|
||||
|
||||
assert not env.pageserver.layer_exists(
tenant_id, timeline_id, l1_found
), "partial compaction result should have been removed during startup"
assert not l1_found.exists(), "partial compaction result should have been removed during startup"
|
||||
|
||||
# wait for us to catch up again
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
|
||||
@@ -126,18 +130,18 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
# give time for log flush
|
||||
time.sleep(1)
|
||||
|
||||
message = f".*duplicated L1 layer layer={l1_found}"
|
||||
message = f".*duplicated L1 layer layer={l1_found.name}"
|
||||
found_msg = env.pageserver.log_contains(message)
|
||||
# resident or evicted, it should not be overwritten, however it should have been non-existing at startup
assert (
found_msg is None
), "layer should have been removed during startup, did it live on as evicted?"
|
||||
|
||||
assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears"
|
||||
assert l1_found.exists(), "the L1 reappears"
|
||||
|
||||
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
|
||||
|
||||
uploaded = env.pageserver_remote_storage.remote_layer_path(
|
||||
tenant_id, timeline_id, l1_found.to_str()
|
||||
tenant_id, timeline_id, l1_found.name
|
||||
)
|
||||
assert uploaded.exists(), "the L1 is uploaded"
|
||||
|
||||
@@ -7,7 +7,6 @@ from fixtures.neon_fixtures import (
flush_ep_to_pageserver,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.types import parse_layer_file_name
from fixtures.pageserver.utils import wait_for_upload
from fixtures.remote_storage import RemoteStorageKind

@@ -58,9 +57,9 @@ def test_basic_eviction(
for sk in env.safekeepers:
sk.stop()

initial_local_layers = dict(
(parse_layer_file_name(path.name), path)
for path in env.pageserver.list_layers(tenant_id, timeline_id)
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
assert (
len(initial_local_layers) > 1
@@ -74,7 +73,6 @@ def test_basic_eviction(
assert len(initial_local_layers) == len(
initial_layer_map_info.historic_layers
), "Should have the same layers in memory and on disk"

for returned_layer in initial_layer_map_info.historic_layers:
assert (
returned_layer.kind == "Delta"
@@ -83,29 +81,27 @@ def test_basic_eviction(
not returned_layer.remote
), f"All created layers should be present locally, but got {returned_layer}"

returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name)
assert (
returned_layer_name in initial_local_layers
), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}"

local_layer_path = (
env.pageserver.timeline_dir(tenant_id, timeline_id)
/ initial_local_layers[returned_layer_name]
local_layers = list(
filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers)
)
assert (
returned_layer.layer_file_size == local_layer_path.stat().st_size
), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}"
len(local_layers) == 1
), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}"
local_layer = local_layers[0]
assert (
returned_layer.layer_file_size == local_layer.stat().st_size
), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}"

# Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map
for local_layer_name, local_layer_path in initial_local_layers.items():
for local_layer in initial_local_layers:
client.evict_layer(
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name
)
assert not env.pageserver.layer_exists(
tenant_id, timeline_id, local_layer_name
), f"Did not expect to find {local_layer_name} layer after evicting"
assert not any(
new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*")
), f"Did not expect to find {local_layer} layer after evicting"

empty_layers = env.pageserver.list_layers(tenant_id, timeline_id)
empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"

evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
@@ -122,15 +118,15 @@ def test_basic_eviction(
assert (
returned_layer.remote
), f"All layers should be evicted and not present locally, but got {returned_layer}"
returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name)
assert (
returned_layer_name in initial_local_layers
assert any(
local_layer.name == returned_layer.layer_file_name
for local_layer in initial_local_layers
), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}"

# redownload all evicted layers and ensure the initial state is restored
for local_layer_name, _local_layer_path in initial_local_layers.items():
for local_layer in initial_local_layers:
client.download_layer(
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str()
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name
)
client.timeline_download_remote_layers(
tenant_id,
@@ -141,9 +137,8 @@ def test_basic_eviction(
at_least_one_download=False,
)

redownloaded_layers = dict(
(parse_layer_file_name(path.name), path)
for path in env.pageserver.list_layers(tenant_id, timeline_id)
redownloaded_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
assert (
redownloaded_layers == initial_local_layers
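Note: the following is an illustrative sketch, not code from the repository. It condenses the eviction loop that test_basic_eviction performs above, assuming a pageserver HTTP client exposing the evict_layer() call shown in the hunk and a Path pointing at the timeline directory; evict_all_local_layers is a hypothetical helper name.

    from pathlib import Path

    def evict_all_local_layers(client, tenant_id, timeline_id, timeline_path: Path) -> None:
        # In this on-disk layout, every file in the timeline directory except
        # "metadata" is a layer file; ask the pageserver to evict each by name.
        for layer in sorted(timeline_path.glob("*")):
            if layer.name == "metadata":
                continue
            client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)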
@@ -6,7 +6,6 @@ from string import ascii_lowercase
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
AuxFileStore,
NeonEnv,
NeonEnvBuilder,
logical_replication_sync,
@@ -20,19 +19,6 @@ def random_string(n: int):
return "".join([choice(ascii_lowercase) for _ in range(n)])


@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation]
)
def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore):
env = neon_simple_env
with env.pageserver.http_client() as client:
tenant_config = client.tenant_config(env.initial_tenant).effective_config
assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"]


@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
)
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env

@@ -174,9 +160,6 @@ COMMIT;


# Test that neon.logical_replication_max_snap_files works
@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
)
def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
def slot_removed(ep):
assert (
@@ -298,9 +281,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of

# Test compute start at LSN page of which starts with contrecord
# https://github.com/neondatabase/neon/issues/5749
@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
)
def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env

@@ -391,9 +371,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
# logical replication bug as such, but without logical replication,
# records passed ot the WAL redo process are never large enough to hit
# the bug.
@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
)
def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env

@@ -465,9 +442,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")


@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
)
def test_replication_shutdown(neon_simple_env: NeonEnv):
# Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
env = neon_simple_env
@@ -10,7 +10,6 @@ of the pageserver are:
"""

import enum
import os
import re
import time
from typing import Optional
@@ -221,12 +220,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
# We will start a pageserver with no control_plane_api set, so it won't be able to self-register
env.storage_controller.node_register(env.pageserver)

replaced_config = env.pageserver.patch_config_toml_nonrecursive(
{
"control_plane_api": "",
}
)
env.pageserver.start()
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})

env.neon_cli.create_tenant(
@@ -257,8 +251,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
assert parse_generation_suffix(key) is None

env.pageserver.stop()

# Starting without the override that disabled control_plane_api
env.pageserver.patch_config_toml_nonrecursive(replaced_config)
env.pageserver.start()

generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
@@ -531,12 +525,9 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
# incident, but it might be unavoidable: if so, we want to be able to start up
# and serve clients.
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
replaced = env.pageserver.patch_config_toml_nonrecursive(
{
"control_plane_emergency_mode": True,
}
env.pageserver.start(
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
)
env.pageserver.start()

# The pageserver should provide service to clients
generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
@@ -558,7 +549,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

# The pageserver should work fine when subsequently restarted in non-emergency mode
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
env.pageserver.patch_config_toml_nonrecursive(replaced)
env.pageserver.start()

generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
@@ -701,50 +691,3 @@ def test_multi_attach(

# All data we wrote while multi-attached remains readable
workload.validate(pageservers[2].id)


@pytest.mark.skip(reason="To be enabled after release with new local path style")
def test_upgrade_generationless_local_file_paths(
neon_env_builder: NeonEnvBuilder,
):
"""
Test pageserver behavior when startup up with local layer paths without
generation numbers: it should accept these layer files, and avoid doing
a delete/download cycle on them.
"""
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(1000)

env.pageserver.stop()

# Rename the local paths to legacy format, to simulate what
# we would see when upgrading
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
files_renamed = 0
for filename in os.listdir(timeline_dir):
path = os.path.join(timeline_dir, filename)
log.info(f"Found file {path}")
if path.endswith("-00000001"):
new_path = path[:-9]
os.rename(path, new_path)
log.info(f"Renamed {path} -> {new_path}")
files_renamed += 1

assert files_renamed > 0

env.pageserver.start()

workload.validate()

# Assert that there were no on-demand downloads
assert (
env.pageserver.http_client().get_metric_value(
"pageserver_remote_ondemand_downloaded_layers_total"
)
== 0
)
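Note: the following is an illustrative sketch, not code from the repository. The hunks above replace patch_config_toml_nonrecursive() with command-line overrides; config_override is a hypothetical helper showing how such an override tuple could be built, mirroring the literal flags that appear in the diff.

    def config_override(key: str, value: str) -> tuple[str, ...]:
        # e.g. --pageserver-config-override=control_plane_emergency_mode=true
        return (f"--pageserver-config-override={key}={value}",)

    assert config_override("control_plane_emergency_mode", "true") == (
        "--pageserver-config-override=control_plane_emergency_mode=true",
    )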
@@ -2,12 +2,12 @@ import json
import os
import random
import time
from pathlib import Path
from typing import Any, Dict, Optional

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
from fixtures.pageserver.types import parse_layer_file_name
from fixtures.pageserver.utils import (
assert_prefix_empty,
poll_for_remote_storage_iterations,
@@ -51,13 +51,9 @@ def evict_random_layers(
if "ephemeral" in layer.name or "temp_download" in layer.name:
continue

layer_name = parse_layer_file_name(layer.name)

if rng.choice([True, False]):
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}")
client.evict_layer(
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str()
)
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)


@pytest.mark.parametrize("seed", [1, 2, 3])
@@ -406,6 +402,32 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
validate_heatmap(heatmap_second)


def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
"""
Inspect local storage on a pageserver to discover which layer files are present.

:return: list of relative paths to layers, from the timeline root.
"""
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)

def relative(p: Path) -> Path:
return p.relative_to(timeline_path)

return sorted(
list(
map(
relative,
filter(
lambda path: path.name != "metadata"
and "ephemeral" not in path.name
and "temp" not in path.name,
timeline_path.glob("*"),
),
)
)
)


def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
"""
Test the overall data flow in secondary mode:
@@ -460,8 +482,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):

ps_secondary.http_client().tenant_secondary_download(tenant_id)

assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
tenant_id, timeline_id
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)

# Make changes on attached pageserver, check secondary downloads them
@@ -478,8 +500,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
ps_secondary.http_client().tenant_secondary_download(tenant_id)

try:
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
tenant_id, timeline_id
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
except:
# Do a full listing of the secondary location on errors, to help debug of
@@ -501,8 +523,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# ==================================================================
try:
log.info("Evicting a layer...")
layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0]
some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1]
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
log.info(f"Victim layer: {layer_to_evict.name}")
ps_attached.http_client().evict_layer(
tenant_id, timeline_id, layer_name=layer_to_evict.name
@@ -515,13 +537,13 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
)
assert layer_to_evict.name not in heatmap_layers
assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers
assert some_other_layer.name in heatmap_layers

ps_secondary.http_client().tenant_secondary_download(tenant_id)

assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id)
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
tenant_id, timeline_id
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
except:
# On assertion failures, log some details to help with debugging
@@ -608,7 +630,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
for timeline_id in timelines:
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
# One or more layers should be present for all timelines
assert ps_secondary.list_layers(tenant_id, timeline_id)
assert list_layers(ps_secondary, tenant_id, timeline_id)

# Delete the second timeline: this should be reflected later on the secondary
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
@@ -623,10 +645,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
ps_secondary = next(p for p in env.pageservers if p != ps_attached)

# This one was not deleted
assert ps_secondary.list_layers(tenant_id, timelines[0])
assert list_layers(ps_secondary, tenant_id, timelines[0])

# This one was deleted
assert not ps_secondary.list_layers(tenant_id, timelines[1])
assert not list_layers(ps_secondary, tenant_id, timelines[1])

t_end = time.time()

@@ -686,7 +708,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)

# Expect lots of layers
assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10
assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10

# Simulate large data by making layer downloads artifically slow
for ps in env.pageservers:
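Note: the following is an illustrative sketch, not code from the repository. The list_layers() helper shown above returns layer paths relative to the timeline directory, so listings taken from different pageservers can be compared directly; missing_on_secondary is a hypothetical helper illustrating that comparison.

    from pathlib import Path

    def missing_on_secondary(attached: list[Path], secondary: list[Path]) -> set[Path]:
        # Layers present on the attached pageserver but not yet downloaded
        # by the secondary location.
        return set(attached) - set(secondary)

    assert missing_on_secondary([Path("a"), Path("b")], [Path("a")]) == {Path("b")}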
@@ -1,35 +0,0 @@
import json
import re
from pathlib import Path

from fixtures.neon_fixtures import PgBin
from fixtures.pg_version import PgVersion


def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
"""Test that Postgres version matches the one we expect"""

with (base_dir / "vendor" / "revisions.json").open() as f:
expected_revisions = json.load(f)

output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False)
stdout = Path(f"{output_prefix}.stdout")
assert stdout.exists(), "postgres --version didn't print anything to stdout"

with stdout.open() as f:
output = f.read().strip()

# `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)".
pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)"
match = re.search(pattern, output, re.IGNORECASE)
assert match is not None, f"Can't parse {output} with {pattern}"

version = match.group("version")
commit = match.group("commit")

assert (
pg_version.v_prefixed in expected_revisions
), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"

msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
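Note: the following is an illustrative sketch, not code from the repository. The removed test above parses `postgres --version` output with a regex and compares it against vendor/revisions.json; this self-contained snippet exercises just the parsing step, using the sample string from the test's own comment.

    import re

    sample = "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)"
    pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)"
    match = re.search(pattern, sample, re.IGNORECASE)
    assert match is not None
    assert match.group("version") == "15.6"
    assert match.group("commit").startswith("85d809c")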
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.types import parse_layer_file_name
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_for_last_record_lsn,
@@ -830,9 +829,8 @@ def test_compaction_waits_for_upload(
assert len(upload_stuck_layers) > 0

for name in upload_stuck_layers:
assert env.pageserver.layer_exists(
tenant_id, timeline_id, parse_layer_file_name(name)
), "while uploads are stuck the layers should be present on disk"
path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name
assert path.exists(), "while uploads are stuck the layers should be present on disk"

# now this will do the L0 => L1 compaction and want to remove
# upload_stuck_layers and the original initdb L0
@@ -840,9 +838,8 @@ def test_compaction_waits_for_upload(

# as uploads are paused, the upload_stuck_layers should still be with us
for name in upload_stuck_layers:
assert env.pageserver.layer_exists(
tenant_id, timeline_id, parse_layer_file_name(name)
), "uploads are stuck still over compaction"
path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name
assert path.exists(), "uploads are stuck still over compaction"

compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
overlap = compacted_layers.intersection(upload_stuck_layers)
@@ -876,8 +873,9 @@ def test_compaction_waits_for_upload(
wait_until(10, 1, until_layer_deletes_completed)

for name in upload_stuck_layers:
assert not env.pageserver.layer_exists(
tenant_id, timeline_id, parse_layer_file_name(name)
path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name
assert (
not path.exists()
), "l0 should now be removed because of L0 => L1 compaction and completed uploads"

# We should not have hit the error handling path in uploads where a uploaded file is gone
@@ -177,67 +177,6 @@ def test_sharding_split_unsharded(
env.storage_controller.consistency_check()


def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
"""
Test that after a split, we clean up parent layer data in the child shards via compaction.
"""
TENANT_CONF = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{128 * 1024}",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "3600s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
}

env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

# Check that we created with an unsharded TenantShardId: this is the default,
# but check it in case we change the default in future
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None

workload = Workload(env, tenant_id, timeline_id, branch_name="main")
workload.init()
workload.write_rows(256)
workload.validate()
workload.stop()

# Split one shard into two
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)

# Check we got the shard IDs we expected
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None

workload.validate()
workload.stop()

env.storage_controller.consistency_check()

# Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant
for shard in shards:
ps = env.get_tenant_pageserver(shard)

# Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes
detail_before = ps.http_client().timeline_detail(shard, timeline_id)
ps.http_client().timeline_compact(shard, timeline_id)
detail_after = ps.http_client().timeline_detail(shard, timeline_id)

# Physical size should shrink because some layers have been dropped
assert detail_after["current_physical_size"] < detail_before["current_physical_size"]

# Compaction shouldn't make anything unreadable
workload.validate()


def test_sharding_split_smoke(
neon_env_builder: NeonEnvBuilder,
):
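Note: the following is an illustrative sketch, not code from the repository. It restates the check made by test_sharding_split_compaction above, assuming a pageserver HTTP client with the timeline_detail()/timeline_compact() calls used in that test; compaction_shrinks_physical_size is a hypothetical helper name.

    def compaction_shrinks_physical_size(http_client, shard, timeline_id) -> bool:
        # Compaction after a shard split should strictly reduce physical size,
        # because layers that do not overlap the shard's key stripes get dropped.
        before = http_client.timeline_detail(shard, timeline_id)["current_physical_size"]
        http_client.timeline_compact(shard, timeline_id)
        after = http_client.timeline_detail(shard, timeline_id)["current_physical_size"]
        return after < before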
@@ -290,12 +290,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
# This is the pageserver where we'll initially create the tenant. Run it in emergency
# mode so that it doesn't talk to storage controller, and do not register it.
env.pageservers[0].allowed_errors.append(".*Emergency mode!.*")
env.pageservers[0].patch_config_toml_nonrecursive(
{
"control_plane_emergency_mode": True,
}
env.pageservers[0].start(
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
)
env.pageservers[0].start()
origin_ps = env.pageservers[0]

# These are the pageservers managed by the sharding service, where the tenant
@@ -18,7 +18,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
last_flush_lsn_upload,
)
from fixtures.pageserver.types import parse_layer_file_name
from fixtures.pageserver.utils import (
assert_tenant_state,
wait_for_last_record_lsn,
@@ -247,10 +246,7 @@ def test_tenant_redownloads_truncated_file_on_startup(

# ensure the same size is found from the index_part.json
index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id)
assert (
index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"]
== expected_size
)
assert index_part["layer_metadata"][path.name]["file_size"] == expected_size

## Start the pageserver. It will notice that the file size doesn't match, and
## rename away the local file. It will be re-downloaded when it's needed.
@@ -280,7 +276,7 @@ def test_tenant_redownloads_truncated_file_on_startup(

# the remote side of local_layer_truncated
remote_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, parse_layer_file_name(path.name).to_str()
tenant_id, timeline_id, path.name
)

# if the upload ever was ongoing, this check would be racy, but at least one
@@ -1,403 +0,0 @@
import enum
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue
from threading import Barrier
from typing import List

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import HistoricLayerInfo
from fixtures.pageserver.utils import wait_timeline_detail_404
from fixtures.types import Lsn, TimelineId


def by_end_lsn(info: HistoricLayerInfo) -> Lsn:
assert info.lsn_end is not None
return Lsn(info.lsn_end)


def layer_name(info: HistoricLayerInfo) -> str:
return info.layer_file_name


@enum.unique
class Branchpoint(str, enum.Enum):
"""
Have branches at these Lsns possibly relative to L0 layer boundary.
"""

EARLIER = "earlier"
AT_L0 = "at"
AFTER_L0 = "after"
LAST_RECORD_LSN = "head"

def __str__(self) -> str:
return self.value

@staticmethod
def all() -> List["Branchpoint"]:
return [
Branchpoint.EARLIER,
Branchpoint.AT_L0,
Branchpoint.AFTER_L0,
Branchpoint.LAST_RECORD_LSN,
]


SHUTDOWN_ALLOWED_ERRORS = [
".*initial size calculation failed: downloading failed, possibly for shutdown",
".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
]


@pytest.mark.parametrize("branchpoint", Branchpoint.all())
@pytest.mark.parametrize("restart_after", [True, False])
def test_ancestor_detach_branched_from(
neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool
):
"""
Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached.
"""
# TODO: parametrize; currently unimplemented over at pageserver
write_to_branch_first = True

env = neon_env_builder.init_start()

env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)

client = env.pageserver.http_client()

with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
ep.safe_psql("CREATE TABLE foo (i BIGINT);")

after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")

# create a single layer for us to remote copy
wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)

ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers()
# there is also the in-mem layer, but ignore it for now
assert len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed"
later_delta = max(deltas, key=by_end_lsn)
assert later_delta.lsn_end is not None

# -1 as the lsn_end is exclusive.
last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1

if branchpoint == Branchpoint.EARLIER:
branch_at = after_first_tx
rows = 0
truncated_layers = 1
elif branchpoint == Branchpoint.AT_L0:
branch_at = Lsn(last_lsn)
rows = 8192
truncated_layers = 0
elif branchpoint == Branchpoint.AFTER_L0:
branch_at = Lsn(last_lsn + 8)
rows = 8192
# as there is no 8 byte walrecord, nothing should get copied from the straddling layer
truncated_layers = 0
else:
# this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet
assert branchpoint == Branchpoint.LAST_RECORD_LSN
branch_at = None
rows = 16384
truncated_layers = 0

name = "new main"

timeline_id = env.neon_cli.create_branch(
name, "main", env.initial_tenant, ancestor_start_lsn=branch_at
)

recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"])
if branch_at is None:
# fix it up if we need it later (currently unused)
branch_at = recorded
else:
assert branch_at == recorded, "the test should not use unaligned lsns"

if write_to_branch_first:
with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
# make sure the ep is writable
# with BEFORE_L0, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers
ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;")
wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)

# branch must have a flush for "PREV_LSN: none"
client.timeline_checkpoint(env.initial_tenant, timeline_id)
branch_layers = set(
map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers)
)
else:
branch_layers = set()

all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
assert all_reparented == set()

if restart_after:
env.pageserver.stop()
env.pageserver.start()

with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 16384

with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows

old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
old_main = set(map(layer_name, old_main_info.historic_layers))

new_main_info = client.layer_map_info(env.initial_tenant, timeline_id)
new_main = set(map(layer_name, new_main_info.historic_layers))

new_main_copied_or_truncated = new_main - branch_layers
new_main_truncated = new_main_copied_or_truncated - old_main

assert len(new_main_truncated) == truncated_layers
# could additionally check that the symmetric difference has layers starting at the same lsn
# but if nothing was copied, then there is no nice rule.
# there could be a hole in LSNs between copied from the "old main" and the first branch layer.

client.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
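Note: the following is an illustrative sketch, not code from the repository. The branch-point arithmetic above relies on lsn_end being exclusive, so the last record covered by a delta layer sits one byte earlier, and the AFTER_L0 case probes 8 bytes past it (no WAL record is that small, so nothing extra is copied from the straddling layer). The lsn_end value below is made up for illustration.

    layer_lsn_end = 0x1_0000_0000  # hypothetical exclusive end LSN of a delta layer
    last_lsn = layer_lsn_end - 1   # AT_L0 branch point
    after_l0 = last_lsn + 8        # AFTER_L0 branch point
    assert after_l0 - last_lsn == 8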
@pytest.mark.parametrize("restart_after", [True, False])
def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool):
"""
The case from RFC:

                          +-> another branch with same ancestor_lsn as new main
                          |
old main -------|---------X--------->
                |         |         |
                |         |         +-> after
                |         |
                |         +-> new main
                |
                +-> reparented

Ends up as:

old main --------------------------->
                                    |
                                    +-> after

                          +-> another branch with same ancestor_lsn as new main
                          |
new main -------|---------|->
                |
                +-> reparented

We confirm the end result by being able to delete "old main" after deleting "after".
"""

# TODO: support not yet implemented for these
write_to_branch_first = True

env = neon_env_builder.init_start()

env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)

client = env.pageserver.http_client()

with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
ep.safe_psql("CREATE TABLE foo (i BIGINT);")
ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;")

branchpoint_pipe = wait_for_last_flush_lsn(
env, ep, env.initial_tenant, env.initial_timeline
)

ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")

branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)

ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

# as this only gets reparented, we don't need to write to it like new main
reparented = env.neon_cli.create_branch(
"reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe
)

same_branchpoint = env.neon_cli.create_branch(
"same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x
)

timeline_id = env.neon_cli.create_branch(
"new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x
)

after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None)

if write_to_branch_first:
with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192
with ep.cursor() as cur:
cur.execute("UPDATE audit SET starts = starts + 1")
assert cur.rowcount == 1
wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)

client.timeline_checkpoint(env.initial_tenant, timeline_id)

all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
assert all_reparented == {reparented, same_branchpoint}

if restart_after:
env.pageserver.stop()
env.pageserver.start()

env.pageserver.quiesce_tenants()

# checking the ancestor after is much faster than waiting for the endpoint not start
expected_result = [
("main", env.initial_timeline, None, 16384, 1),
("after", after, env.initial_timeline, 16384, 1),
("new main", timeline_id, None, 8192, 2),
("same_branchpoint", same_branchpoint, timeline_id, 8192, 1),
("reparented", reparented, timeline_id, 0, 1),
]

for _, timeline_id, expected_ancestor, _, _ in expected_result:
details = client.timeline_detail(env.initial_tenant, timeline_id)
ancestor_timeline_id = details["ancestor_timeline_id"]
if expected_ancestor is None:
assert ancestor_timeline_id is None
else:
assert TimelineId(ancestor_timeline_id) == expected_ancestor

for name, _, _, rows, starts in expected_result:
with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1

# delete the timelines to confirm detach actually worked
client.timeline_delete(env.initial_tenant, after)
wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0)

client.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
@pytest.mark.parametrize("restart_after", [True, False])
def test_detached_receives_flushes_while_being_detached(
neon_env_builder: NeonEnvBuilder, restart_after: bool
):
"""
Makes sure that the timeline is able to receive writes through-out the detach process.
"""
write_to_branch_first = True

env = neon_env_builder.init_start()

client = env.pageserver.http_client()

# row counts have been manually verified to cause reconnections and getpage
# requests when restart_after=False with pg16
def insert_rows(n: int, ep) -> int:
ep.safe_psql(
f"INSERT INTO foo SELECT i::bigint, 'more info!! this is a long string' || i FROM generate_series(0, {n - 1}) g(i);"
)
return n

with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);")

rows = insert_rows(256, ep)

branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

timeline_id = env.neon_cli.create_branch(
"new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint
)

log.info("starting the new main endpoint")
ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant)
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows

if write_to_branch_first:
rows += insert_rows(256, ep)
wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)
client.timeline_checkpoint(env.initial_tenant, timeline_id)
log.info("completed {write_to_branch_first=}")

def small_txs(ep, queue: Queue[str], barrier):
extra_rows = 0

with ep.connect() as conn:
while True:
try:
queue.get_nowait()
break
except Empty:
pass

if barrier is not None:
barrier.wait()
barrier = None

cursor = conn.cursor()
cursor.execute(
"INSERT INTO foo(i, aux) VALUES (1, 'more info!! this is a long string' || 1);"
)
extra_rows += 1
return extra_rows

with ThreadPoolExecutor(max_workers=1) as exec:
queue: Queue[str] = Queue()
barrier = Barrier(2)

completion = exec.submit(small_txs, ep, queue, barrier)
barrier.wait()

reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
assert len(reparented) == 0

if restart_after:
# ep and row production is kept alive on purpose
env.pageserver.stop()
env.pageserver.start()

env.pageserver.quiesce_tenants()

queue.put("done")
extra_rows = completion.result()
assert extra_rows > 0, "some rows should had been written"
rows += extra_rows

assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None

assert ep.safe_psql("SELECT clear_buffer_cache();")
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
ep.stop()

# finally restart the endpoint and make sure we still have the same answer
with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows

env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)


# TODO:
# - after starting the operation, tenant is deleted
# - after starting the operation, pageserver is shutdown, restarted
# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited
# - deletion of reparented while reparenting should fail once, then succeed (?)
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
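Note: the following is an illustrative sketch, not code from the repository. small_txs() above uses a Queue as a stop signal and a Barrier to synchronize its start with the main thread; worker() is a hypothetical stand-in for small_txs showing that pattern in a self-contained, runnable form.

    from concurrent.futures import ThreadPoolExecutor
    from queue import Empty, Queue
    from threading import Barrier

    def worker(queue: "Queue[str]", barrier: Barrier) -> int:
        barrier.wait()  # start in lockstep with the main thread
        iterations = 0
        while True:
            try:
                queue.get_nowait()
                break  # sentinel received: stop
            except Empty:
                iterations += 1  # one unit of work per loop iteration
        return iterations

    with ThreadPoolExecutor(max_workers=1) as pool:
        q: "Queue[str]" = Queue()
        b = Barrier(2)
        fut = pool.submit(worker, q, b)
        b.wait()
        q.put("done")
        assert fut.result() >= 0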
6
vendor/revisions.json
vendored
@@ -1,5 +1,5 @@
{
"v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
"v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"],
"v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"]
"postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46",
"postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a",
"postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"
}
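Note: the following is an illustrative sketch, not code from the repository. The hunk above changes vendor/revisions.json from short version keys with [postgres_version, commit] pairs to a flat mapping from submodule directory name to commit hash; this tiny snippet shows how the same commit is read from each shape.

    old_style = {"v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"]}
    new_style = {"postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46"}

    # Old format: value is [postgres_version, commit]; new format: value is the commit.
    assert old_style["v16"][1] == new_style["postgres-v16"]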
@@ -27,6 +27,7 @@ bytes = { version = "1", features = ["serde"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
clap = { version = "4", features = ["derive", "string"] }
clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
crossbeam-epoch = { version = "0.9" }
crossbeam-utils = { version = "0.8" }
either = { version = "1" }
fail = { version = "0.5", default-features = false, features = ["failpoints"] }