Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-01 09:40:38 +00:00

Compare commits: release-pr ... problame/p (34 commits)

Commit SHA1s:
b1e21c7705, 004af53035, d8702dd819, 5f04224817, 9c547da6a6, 6d343feef0, f73c8c6bd6,
51224c84c2, 6bccd64514, 28c95e4207, aacf8110a0, 70977afd07, 7363b44b50, cc64e1b17f,
25dfafc2df, d72fe6f5ee, 0bca1a5de3, 511f593360, b96e0b2458, b4ed3b15b9, ad185dd594,
58055c7a96, ec04f0f4d4, a52b563b59, 89afba066c, 6bcb0959ad, 8f3051b416, 998dc6255e,
700aa96770, 4a72fe0908, 923cdff13d, 498edfc0ff, d2e2a88737, 6f720eb38f
.github/workflows/build_and_test.yml (vendored): 21 lines changed

@@ -236,6 +236,27 @@ jobs:
          submodules: true
          fetch-depth: 1

      - name: Check Postgres submodules revision
        shell: bash -euo pipefail {0}
        run: |
          # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603

          FAILED=false
          for postgres in postgres-v14 postgres-v15 postgres-v16; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
              echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
              FAILED=true
            fi
          done

          if [ "${FAILED}" = "true" ]; then
            echo >&2 "Please update vendor/revisions.json if these changes are intentional"
            exit 1
          fi

      - name: Set pg 14 revision for caching
        id: pg_v14_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
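The step above cross-checks each vendored Postgres submodule against vendor/revisions.json and fails the build on a mismatch. The comment says it will eventually become a regression test; purely as an illustration (not the actual test from the linked PR), here is a hedged Rust sketch of the same check, assuming a `serde_json` dev-dependency and that the test runs from the repository root:

```rust
use std::process::Command;

/// Compare the submodule revisions recorded in vendor/revisions.json with
/// the gitlinks actually checked in at HEAD. Sketch of a possible regression
/// test; paths and key names follow the workflow step above.
#[test]
fn postgres_submodules_match_revisions_json() {
    let text = std::fs::read_to_string("vendor/revisions.json")
        .expect("read vendor/revisions.json");
    let expected: serde_json::Value = serde_json::from_str(&text).expect("parse json");

    for postgres in ["postgres-v14", "postgres-v15", "postgres-v16"] {
        let out = Command::new("git")
            .args(["rev-parse", &format!("HEAD:vendor/{postgres}")])
            .output()
            .expect("run git rev-parse");
        assert!(out.status.success(), "git rev-parse failed for {postgres}");
        let actual = String::from_utf8_lossy(&out.stdout).trim().to_string();

        let expected_rev = expected[postgres].as_str().unwrap_or_default();
        assert_eq!(
            expected_rev, actual,
            "{postgres}: update vendor/revisions.json if this change is intentional"
        );
    }
}
```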
Cargo.lock (generated): 1 line changed

@@ -4372,7 +4372,6 @@ dependencies = [
 "hyper 1.2.0",
 "hyper-tungstenite",
 "hyper-util",
 "indexmap 2.0.1",
 "ipnet",
 "itertools",
 "lasso",

@@ -99,7 +99,6 @@ humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
Makefile: 11 lines changed

@@ -81,14 +81,11 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
	echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
	exit 1; }
	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*

	VERSION=$*; \
	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
	CFLAGS='$(PG_CFLAGS)' \
	$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
	--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
	$(PG_CONFIGURE_OPTS) \
	--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)

# nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration.
||||
@@ -51,7 +51,6 @@ use tracing::{error, info, warn};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
use compute_tools::compute::{
|
||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||
@@ -70,34 +69,6 @@ use compute_tools::swap::resize_swap;
|
||||
const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let (build_tag, clap_args) = init()?;
|
||||
|
||||
let (pg_handle, start_pg_result) = {
|
||||
// Enter startup tracing context
|
||||
let _startup_context_guard = startup_context_from_env();
|
||||
|
||||
let cli_args = process_cli(&clap_args)?;
|
||||
|
||||
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
|
||||
|
||||
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
|
||||
|
||||
start_postgres(&clap_args, wait_spec_result)?
|
||||
|
||||
// Startup is finished, exit the startup tracing span
|
||||
};
|
||||
|
||||
// PostgreSQL is now running, if startup was successful. Wait until it exits.
|
||||
let wait_pg_result = wait_postgres(pg_handle)?;
|
||||
|
||||
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
|
||||
|
||||
maybe_delay_exit(delay_exit);
|
||||
|
||||
deinit_and_exit(wait_pg_result);
|
||||
}
|
||||
|
||||
fn init() -> Result<(String, clap::ArgMatches)> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
@@ -112,15 +83,9 @@ fn init() -> Result<(String, clap::ArgMatches)> {
|
||||
.to_string();
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
Ok((build_tag, cli().get_matches()))
|
||||
}
|
||||
|
||||
fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
let pgbin_default = "postgres";
|
||||
let pgbin = matches
|
||||
.get_one::<String>("pgbin")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(pgbin_default);
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let ext_remote_storage = matches
|
||||
.get_one::<String>("remote-ext-config")
|
||||
@@ -148,30 +113,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
|
||||
|
||||
Ok(ProcessCliResult {
|
||||
connstr,
|
||||
pgdata,
|
||||
pgbin,
|
||||
ext_remote_storage,
|
||||
http_port,
|
||||
spec_json,
|
||||
spec_path,
|
||||
resize_swap_on_bind,
|
||||
})
|
||||
}
|
||||
|
||||
struct ProcessCliResult<'clap> {
|
||||
connstr: &'clap str,
|
||||
pgdata: &'clap str,
|
||||
pgbin: &'clap str,
|
||||
ext_remote_storage: Option<&'clap str>,
|
||||
http_port: u16,
|
||||
spec_json: Option<&'clap String>,
|
||||
spec_path: Option<&'clap String>,
|
||||
resize_swap_on_bind: bool,
|
||||
}
|
||||
|
||||
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -208,7 +149,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
if let Ok(val) = std::env::var("TRACESTATE") {
|
||||
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
||||
}
|
||||
if !startup_tracing_carrier.is_empty() {
|
||||
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
||||
let guard = TraceContextPropagator::new()
|
||||
@@ -218,17 +159,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
Some(guard)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
fn try_spec_from_cli(
|
||||
matches: &clap::ArgMatches,
|
||||
ProcessCliResult {
|
||||
spec_json,
|
||||
spec_path,
|
||||
..
|
||||
}: &ProcessCliResult,
|
||||
) -> Result<CliSpecParams> {
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
@@ -269,34 +201,6 @@ fn try_spec_from_cli(
|
||||
}
|
||||
};
|
||||
|
||||
Ok(CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
})
|
||||
}
|
||||
|
||||
struct CliSpecParams {
|
||||
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
|
||||
spec: Option<ComputeSpec>,
|
||||
live_config_allowed: bool,
|
||||
}
|
||||
|
||||
fn wait_spec(
|
||||
build_tag: String,
|
||||
ProcessCliResult {
|
||||
connstr,
|
||||
pgdata,
|
||||
pgbin,
|
||||
ext_remote_storage,
|
||||
resize_swap_on_bind,
|
||||
http_port,
|
||||
..
|
||||
}: ProcessCliResult,
|
||||
CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
}: CliSpecParams,
|
||||
) -> Result<WaitSpecResult> {
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
@@ -335,6 +239,8 @@ fn wait_spec(
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
@@ -363,29 +269,6 @@ fn wait_spec(
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
})
|
||||
}
|
||||
|
||||
struct WaitSpecResult {
|
||||
compute: Arc<ComputeNode>,
|
||||
// passed through from ProcessCliResult
|
||||
http_port: u16,
|
||||
resize_swap_on_bind: bool,
|
||||
}
|
||||
|
||||
fn start_postgres(
|
||||
// need to allow unused because `matches` is only used if target_os = "linux"
|
||||
#[allow(unused_variables)] matches: &clap::ArgMatches,
|
||||
WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
}: WaitSpecResult,
|
||||
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||
// We got all we need, update the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.status = ComputeStatus::Init;
|
||||
@@ -435,10 +318,10 @@ fn start_postgres(
|
||||
}
|
||||
}
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
// Start Postgres
|
||||
let mut pg = None;
|
||||
let mut exit_code = None;
|
||||
|
||||
if !prestartup_failed {
|
||||
pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
@@ -493,7 +376,7 @@ fn start_postgres(
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
let token = CancellationToken::new();
|
||||
|
||||
let vm_monitor = rt.as_ref().map(|rt| {
|
||||
let vm_monitor = &rt.as_ref().map(|rt| {
|
||||
rt.spawn(vm_monitor::start(
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: cgroup.cloned(),
|
||||
@@ -506,41 +389,12 @@ fn start_postgres(
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
pg,
|
||||
StartPostgresResult {
|
||||
delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
rt,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
|
||||
|
||||
struct StartPostgresResult {
|
||||
delay_exit: bool,
|
||||
// passed through from WaitSpecResult
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
rt: Option<tokio::runtime::Runtime>,
|
||||
#[cfg(target_os = "linux")]
|
||||
token: tokio_util::sync::CancellationToken,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
||||
// propagate to Postgres and it will be shut down as well.
|
||||
let mut exit_code = None;
|
||||
if let Some((mut pg, logs_handle)) = pg {
|
||||
// Startup is finished, exit the startup tracing span
|
||||
drop(startup_context_guard);
|
||||
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
@@ -555,25 +409,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
|
||||
Ok(WaitPostgresResult { exit_code })
|
||||
}
|
||||
|
||||
struct WaitPostgresResult {
|
||||
exit_code: Option<i32>,
|
||||
}
|
||||
|
||||
fn cleanup_after_postgres_exit(
|
||||
StartPostgresResult {
|
||||
mut delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
rt,
|
||||
}: StartPostgresResult,
|
||||
) -> Result<bool> {
|
||||
// Terminate the vm_monitor so it releases the file watcher on
|
||||
// /sys/fs/cgroup/neon-postgres.
|
||||
// Note: the vm-monitor only runs on linux because it requires cgroups.
|
||||
@@ -615,19 +450,13 @@ fn cleanup_after_postgres_exit(
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
|
||||
Ok(delay_exit)
|
||||
}
|
||||
|
||||
fn maybe_delay_exit(delay_exit: bool) {
|
||||
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
||||
// control plane can get the actual error.
|
||||
if delay_exit {
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
}
|
||||
}
|
||||
|
||||
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit. Shutting down OTEL tracing provider may
|
||||
// hang for quite some time, see, for example:
|
||||
|
||||
@@ -382,7 +382,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
// Initialize pageserver, create initial tenant and timeline.
|
||||
for ps_conf in &env.pageservers {
|
||||
PageServerNode::from_env(&env, ps_conf)
|
||||
.initialize(pageserver_config.clone())
|
||||
.initialize(&pageserver_config)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("pageserver init failed: {e:?}");
|
||||
exit(1);
|
||||
@@ -835,8 +835,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.copied()
|
||||
.unwrap_or(false);
|
||||
|
||||
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||
|
||||
let mode = match (lsn, hot_standby) {
|
||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||
(None, true) => ComputeMode::Replica,
|
||||
@@ -854,9 +852,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
_ => {}
|
||||
}
|
||||
|
||||
if !allow_multiple {
|
||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
||||
}
|
||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
||||
|
||||
cplane.new_endpoint(
|
||||
&endpoint_id,
|
||||
@@ -885,8 +881,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
|
||||
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
@@ -912,13 +906,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
if !allow_multiple {
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
)?;
|
||||
}
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
endpoint.timeline_id,
|
||||
)?;
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
@@ -1434,12 +1426,6 @@ fn cli() -> Command {
|
||||
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
||||
.required(false);
|
||||
|
||||
let allow_multiple = Arg::new("allow-multiple")
|
||||
.help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
|
||||
.long("allow-multiple")
|
||||
.action(ArgAction::SetTrue)
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1602,7 +1588,6 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(hot_standby_arg.clone())
|
||||
.arg(update_catalog)
|
||||
.arg(allow_multiple.clone())
|
||||
)
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
@@ -1611,7 +1596,6 @@ fn cli() -> Command {
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
.arg(allow_multiple.clone())
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
|
||||
@@ -562,10 +562,6 @@ impl LocalEnv {
            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
        }

        for ps in &self.pageservers {
            fs::create_dir(self.pageserver_data_dir(ps.id))?;
        }

        self.persist_config(base_path)
    }
@@ -10,15 +10,14 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
|
||||
TimelineInfo,
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -74,12 +73,10 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
fn pageserver_init_make_toml(
|
||||
&self,
|
||||
cli_overrides: toml_edit::Document,
|
||||
) -> anyhow::Result<toml_edit::Document> {
|
||||
// TODO: this is a legacy code, it should be refactored to use toml_edit directly.
|
||||
|
||||
/// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
|
||||
///
|
||||
/// These all end up on the command line of the `pageserver` binary.
|
||||
fn neon_local_overrides(&self, cli_overrides: &toml_edit::Document) -> Vec<String> {
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||
let pg_distrib_dir_param = format!(
|
||||
"pg_distrib_dir='{}'",
|
||||
@@ -174,21 +171,12 @@ impl PageServerNode {
|
||||
// Apply the user-provided overrides
|
||||
overrides.push(cli_overrides.to_string());
|
||||
|
||||
// Turn `overrides` into a toml document.
|
||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||
let mut config_toml = toml_edit::Document::new();
|
||||
for fragment_str in overrides {
|
||||
let fragment = toml_edit::Document::from_str(&fragment_str)
|
||||
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
||||
for (key, item) in fragment.iter() {
|
||||
config_toml.insert(key, item.clone());
|
||||
}
|
||||
}
|
||||
Ok(config_toml)
|
||||
overrides
|
||||
}
|
||||
|
||||
/// Initializes a pageserver node by creating its config with the overrides provided.
|
||||
pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> {
|
||||
pub fn initialize(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
|
||||
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
|
||||
self.pageserver_init(config_overrides)
|
||||
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
|
||||
}
|
||||
@@ -209,7 +197,7 @@ impl PageServerNode {
|
||||
self.start_node().await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> {
|
||||
fn pageserver_init(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
|
||||
let datadir = self.repo_path();
|
||||
let node_id = self.conf.id;
|
||||
println!(
|
||||
@@ -220,20 +208,36 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
|
||||
let config = self
|
||||
.pageserver_init_make_toml(cli_overrides)
|
||||
.context("make pageserver toml")?;
|
||||
let config_file_path = datadir.join("pageserver.toml");
|
||||
let mut config_file = std::fs::OpenOptions::new()
|
||||
.create_new(true)
|
||||
.write(true)
|
||||
.open(&config_file_path)
|
||||
.with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
|
||||
config_file
|
||||
.write_all(config.to_string().as_bytes())
|
||||
.context("write pageserver toml")?;
|
||||
drop(config_file);
|
||||
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||
if !datadir.exists() {
|
||||
std::fs::create_dir(&datadir)?;
|
||||
}
|
||||
|
||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
||||
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
|
||||
})?;
|
||||
|
||||
// `pageserver --init` merges the `--config-override`s into a built-in default config,
|
||||
// then writes out the merged product to `pageserver.toml`.
|
||||
// TODO: just write the full `pageserver.toml` and get rid of `--config-override`.
|
||||
let mut args = vec!["--init", "--workdir", datadir_path_str];
|
||||
let overrides = self.neon_local_overrides(config_overrides);
|
||||
for piece in &overrides {
|
||||
args.push("--config-override");
|
||||
args.push(piece);
|
||||
}
|
||||
let init_output = Command::new(self.env.pageserver_bin())
|
||||
.args(args)
|
||||
.envs(self.pageserver_env_variables()?)
|
||||
.output()
|
||||
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
|
||||
|
||||
anyhow::ensure!(
|
||||
init_output.status.success(),
|
||||
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
|
||||
node_id,
|
||||
String::from_utf8_lossy(&init_output.stdout),
|
||||
String::from_utf8_lossy(&init_output.stderr),
|
||||
);
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
// the storage controller
|
||||
@@ -425,11 +429,11 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
switch_to_aux_file_v2: settings
|
||||
.remove("switch_to_aux_file_v2")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -548,11 +552,11 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
switch_to_aux_file_v2: settings
|
||||
.remove("switch_to_aux_file_v2")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -480,15 +480,6 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
        let id = self.vec.with_labels(labels);
        self.vec.remove_metric(id)
    }

    pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
        let id = self.vec.with_labels(labels);
        let metric = self.vec.get_metric(id);

        let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
        let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
        inc.saturating_sub(dec)
    }
}

impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
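The hunk above (15 lines before, 6 after) drops `CounterPairVec::sample`, which derived a gauge-like reading from a pair of monotonic counters. A minimal standalone sketch of that counter-pair idea, using plain `AtomicU64`s rather than the `measured` crate's metric types (all names here are illustrative):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// A gauge expressed as two monotonic counters: increments and decrements.
/// The current value is their difference, clamped at zero.
#[derive(Default)]
struct CounterPair {
    inc: AtomicU64,
    dec: AtomicU64,
}

impl CounterPair {
    fn add(&self) {
        self.inc.fetch_add(1, Ordering::Relaxed);
    }
    fn remove(&self) {
        self.dec.fetch_add(1, Ordering::Relaxed);
    }
    /// Same shape as the removed `sample`: load both counters and
    /// saturating-subtract so a racy read never underflows.
    fn sample(&self) -> u64 {
        let inc = self.inc.load(Ordering::Relaxed);
        let dec = self.dec.load(Ordering::Relaxed);
        inc.saturating_sub(dec)
    }
}

fn main() {
    let open_connections = CounterPair::default();
    open_connections.add();
    open_connections.add();
    open_connections.remove();
    assert_eq!(open_connections.sample(), 1);
}
```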
@@ -240,7 +240,7 @@ impl<'a> ShardedRange<'a> {
    /// pages that would not actually be stored on this node.
    ///
    /// Don't use this function in code that works with physical entities like layer files.
    pub fn raw_size(range: &Range<Key>) -> u32 {
    fn raw_size(range: &Range<Key>) -> u32 {
        if is_contiguous_range(range) {
            contiguous_range_len(range)
        } else {
@@ -1,4 +1,3 @@
pub mod detach_ancestor;
pub mod partitioning;
pub mod utilization;

@@ -9,7 +8,6 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
    str::FromStr,
    time::{Duration, SystemTime},
};

@@ -305,31 +303,7 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
    pub switch_aux_file_policy: Option<AuxFilePolicy>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
    V1,
    V2,
    CrossValidation,
}

impl FromStr for AuxFilePolicy {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.to_lowercase();
        if s == "v1" {
            Ok(Self::V1)
        } else if s == "v2" {
            Ok(Self::V2)
        } else if s == "crossvalidation" || s == "cross_validation" {
            Ok(Self::CrossValidation)
        } else {
            anyhow::bail!("cannot parse {} to aux file policy", s)
        }
    }
    pub switch_to_aux_file_v2: Option<bool>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
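Per the hunk header (31 lines before, 7 after), this compare swaps the `AuxFilePolicy` enum and its `FromStr` impl for a plain `switch_to_aux_file_v2: Option<bool>` field on `TenantConfig`. A hedged sketch of how such an optional boolean setting can be pulled out of a string-keyed settings map, mirroring the `remove(...).map(parse).transpose()` pattern used elsewhere in this diff (the helper and map here are illustrative, not the exact neon_local code):

```rust
use std::collections::HashMap;

use anyhow::Context;

/// Pull an optional boolean setting out of a map of raw string settings.
fn parse_optional_bool(
    settings: &mut HashMap<String, String>,
    key: &str,
) -> anyhow::Result<Option<bool>> {
    settings
        .remove(key)
        .map(|raw| raw.parse::<bool>())
        .transpose()
        .with_context(|| format!("Failed to parse '{key}' as bool"))
}

fn main() -> anyhow::Result<()> {
    let mut settings: HashMap<String, String> =
        [("switch_to_aux_file_v2".to_string(), "true".to_string())].into();

    let switch_to_aux_file_v2 = parse_optional_bool(&mut settings, "switch_to_aux_file_v2")?;
    assert_eq!(switch_to_aux_file_v2, Some(true));

    // Unknown leftover settings are rejected, as in the pageserver node code.
    anyhow::ensure!(settings.is_empty(), "Unrecognized tenant settings: {settings:?}");
    Ok(())
}
```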
@@ -1,6 +0,0 @@
use utils::id::TimelineId;

#[derive(Default, serde::Serialize)]
pub struct AncestorDetached {
    pub reparented_timelines: Vec<TimelineId>,
}
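The hunk above deletes the entire `detach_ancestor` response module (`@@ -1,6 +0,0 @@`: six lines removed, none added). For context, a hedged sketch of how a response struct of this shape serializes with serde_json; `TimelineId` is replaced by a plain `String` so the snippet stands alone:

```rust
use serde::Serialize;

/// Stand-in for the deleted response type; the real struct used
/// `utils::id::TimelineId` instead of `String`.
#[derive(Default, Serialize)]
struct AncestorDetached {
    reparented_timelines: Vec<String>,
}

fn main() -> Result<(), serde_json::Error> {
    let resp = AncestorDetached {
        reparented_timelines: vec!["11223344556677881122334455667788".to_string()],
    };
    // An HTTP handler would return this as the JSON body of a 200 response.
    println!("{}", serde_json::to_string_pretty(&resp)?);
    Ok(())
}
```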
@@ -331,10 +331,7 @@ impl CheckPoint {
    /// Returns 'true' if the XID was updated.
    pub fn update_next_xid(&mut self, xid: u32) -> bool {
        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
        let mut new_xid = std::cmp::max(
            xid.wrapping_add(1),
            pg_constants::FIRST_NORMAL_TRANSACTION_ID,
        );
        let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
        // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
        // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
        new_xid =
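The two forms of the `std::cmp::max` call above differ only in formatting; the interesting part is the arithmetic around it. A self-contained sketch of a wraparound-safe bump plus forward alignment to a checkpoint interval; the constants and the alignment expression are illustrative, since the view cuts off right after `new_xid =`:

```rust
// Illustrative stand-ins; the real values live in postgres_ffi::pg_constants,
// and the exact alignment expression is not visible in the hunk above.
const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
const XID_CHECKPOINT_INTERVAL: u32 = 1024; // must be a power of two for this mask trick

/// Bump past `xid` with wraparound-safe arithmetic, keep out of the reserved
/// "special" XID range, then round up to the next checkpoint-interval boundary.
fn next_checkpoint_xid(xid: u32) -> u32 {
    let new_xid = std::cmp::max(xid.wrapping_add(1), FIRST_NORMAL_TRANSACTION_ID);
    new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1)
}

fn main() {
    assert_eq!(next_checkpoint_xid(0), 1024);
    assert_eq!(next_checkpoint_xid(1024), 2048);
    assert_eq!(next_checkpoint_xid(u32::MAX), 1024); // wraps, hits the floor, then aligns
}
```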
@@ -370,16 +367,8 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);

    let first_page_only = seg_off < XLOG_BLCKSZ;
    // If first records starts in the middle of the page, pretend in page header
    // there is a fake record which ends where first real record starts. This
    // makes pg_waldump etc happy.
    let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
        assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
        // xlp_rem_len doesn't include page header, hence the subtraction.
        (
            seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
            pg_constants::XLP_FIRST_IS_CONTRECORD,
        )
    let (shdr_rem_len, infoflags) = if first_page_only {
        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
    } else {
        (0, 0)
    };

@@ -408,22 +397,20 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte

    if !first_page_only {
        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
        // see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
        let (xlp_rem_len, xlp_info) = if page_off > 0 {
            assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
            (
                (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
                pg_constants::XLP_FIRST_IS_CONTRECORD,
            )
        } else {
            (0, 0)
        };
        let header = XLogPageHeaderData {
            xlp_magic: XLOG_PAGE_MAGIC as u16,
            xlp_info,
            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
                pg_constants::XLP_FIRST_IS_CONTRECORD
            } else {
                0
            },
            xlp_tli: PG_TLI,
            xlp_pageaddr: lsn.page_lsn().0,
            xlp_rem_len,
            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
                page_off as u32
            } else {
                0u32
            },
            ..Default::default() // Put 0 in padding fields.
        };
        let hdr_bytes = header.encode()?;
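The two hunks above interleave one variant of `generate_wal_segment` that subtracts the page-header size (`XLOG_SIZE_OF_XLOG_LONG_PHD` / `XLOG_SIZE_OF_XLOG_SHORT_PHD`) from `xlp_rem_len` with another that stores the raw offset. A small arithmetic sketch of the subtract-the-header convention, with made-up header sizes standing in for the real constants:

```rust
// Illustrative sizes only; the real values come from postgres_ffi
// (a long header on the first page of a segment, a short header elsewhere).
const LONG_PAGE_HEADER_LEN: u64 = 40;
const SHORT_PAGE_HEADER_LEN: u64 = 24;

/// Length of the fake continuation record on a page, given the byte offset
/// within the page at which the first real record starts. This helper shows
/// the "exclude the page header" convention for xlp_rem_len.
fn xlp_rem_len_excluding_header(offset_in_page: u64, first_page_of_segment: bool) -> u32 {
    if offset_in_page == 0 {
        return 0; // no continuation record, so XLP_FIRST_IS_CONTRECORD is not set
    }
    let header_len = if first_page_of_segment {
        LONG_PAGE_HEADER_LEN
    } else {
        SHORT_PAGE_HEADER_LEN
    };
    assert!(offset_in_page >= header_len);
    (offset_in_page - header_len) as u32
}

fn main() {
    // A record resuming 100 bytes into the first page of a segment:
    assert_eq!(xlp_rem_len_excluding_header(100, true), 60);
    // The same offset on a later page, which only carries a short header:
    assert_eq!(xlp_rem_len_excluding_header(100, false), 76);
}
```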
@@ -50,14 +50,6 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).update_donor(&mut (*donor), donor_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
@@ -399,7 +391,6 @@ pub(crate) fn create_api() -> walproposer_api {
|
||||
get_shmem_state: Some(get_shmem_state),
|
||||
start_streaming: Some(start_streaming),
|
||||
get_flush_rec_ptr: Some(get_flush_rec_ptr),
|
||||
update_donor: Some(update_donor),
|
||||
get_current_timestamp: Some(get_current_timestamp),
|
||||
conn_error_message: Some(conn_error_message),
|
||||
conn_status: Some(conn_status),
|
||||
@@ -430,32 +421,6 @@ pub(crate) fn create_api() -> walproposer_api {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
let empty_feedback = crate::bindings::PageserverFeedback {
|
||||
present: false,
|
||||
currentClusterSize: 0,
|
||||
last_received_lsn: 0,
|
||||
disk_consistent_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
replytime: 0,
|
||||
shard_number: 0,
|
||||
};
|
||||
|
||||
crate::bindings::WalproposerShmemState {
|
||||
propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
donor_name: [0; 64],
|
||||
donor_conninfo: [0; 1024],
|
||||
donor_lsn: 0,
|
||||
mutex: 0,
|
||||
mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
shard_ps_feedback: [empty_feedback; 128],
|
||||
num_shards: 0,
|
||||
min_ps_feedback: empty_feedback,
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Level {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use std::ffi::CString;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
@@ -7,8 +10,6 @@ use crate::{
|
||||
WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
|
||||
},
|
||||
};
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
/// Rust high-level wrapper for C walproposer API. Many methods are not required
|
||||
/// for simple cases, hence todo!() in default implementations.
|
||||
@@ -27,10 +28,6 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
todo!()
|
||||
}
|
||||
@@ -277,7 +274,6 @@ mod tests {
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
};
|
||||
|
||||
use std::cell::UnsafeCell;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
@@ -301,8 +297,6 @@ mod tests {
|
||||
replies_ptr: AtomicUsize,
|
||||
// channel to send LSN to the main thread
|
||||
sync_channel: std::sync::mpsc::SyncSender<u64>,
|
||||
// Shmem state, used for storing donor info
|
||||
shmem: UnsafeCell<crate::bindings::WalproposerShmemState>,
|
||||
}
|
||||
|
||||
impl MockImpl {
|
||||
@@ -333,22 +327,11 @@ mod tests {
|
||||
}
|
||||
|
||||
impl ApiImpl for MockImpl {
|
||||
fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
|
||||
self.shmem.get()
|
||||
}
|
||||
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
println!("get_current_timestamp");
|
||||
0
|
||||
}
|
||||
|
||||
fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) {
|
||||
let mut shmem = unsafe { *self.get_shmem_state() };
|
||||
shmem.propEpochStartLsn.value = donor_lsn;
|
||||
shmem.donor_conninfo = donor.conninfo;
|
||||
shmem.donor_lsn = donor_lsn;
|
||||
}
|
||||
|
||||
fn conn_status(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
@@ -524,7 +507,6 @@ mod tests {
|
||||
],
|
||||
replies_ptr: AtomicUsize::new(0),
|
||||
sync_channel: sender,
|
||||
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||
});
|
||||
let config = crate::walproposer::Config {
|
||||
ttid,
|
||||
|
||||
@@ -1,4 +1,3 @@
use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;

@@ -62,84 +61,6 @@ pub fn encode_aux_file_key(path: &str) -> Key {
    }
}

const AUX_FILE_ENCODING_VERSION: u8 = 0x01;

pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
    let mut ptr = val;
    if ptr.is_empty() {
        // empty value = no files
        return Ok(Vec::new());
    }
    assert_eq!(
        ptr.get_u8(),
        AUX_FILE_ENCODING_VERSION,
        "unsupported aux file value"
    );
    let mut files = vec![];
    while ptr.has_remaining() {
        let key_len = ptr.get_u32() as usize;
        let key = &ptr[..key_len];
        ptr.advance(key_len);
        let val_len = ptr.get_u32() as usize;
        let content = &ptr[..val_len];
        ptr.advance(val_len);

        let path = std::str::from_utf8(key)?;
        files.push((path, content));
    }
    Ok(files)
}

/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference
/// to the original value slice. Be cautious about memory consumption.
pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
    let mut ptr = val.clone();
    if ptr.is_empty() {
        // empty value = no files
        return Ok(Vec::new());
    }
    assert_eq!(
        ptr.get_u8(),
        AUX_FILE_ENCODING_VERSION,
        "unsupported aux file value"
    );
    let mut files = vec![];
    while ptr.has_remaining() {
        let key_len = ptr.get_u32() as usize;
        let key = ptr.slice(..key_len);
        ptr.advance(key_len);
        let val_len = ptr.get_u32() as usize;
        let content = ptr.slice(..val_len);
        ptr.advance(val_len);

        let path = std::str::from_utf8(&key)?.to_string();
        files.push((path, content));
    }
    Ok(files)
}

pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
    if files.is_empty() {
        // no files = empty value
        return Ok(Vec::new());
    }
    let mut encoded = vec![];
    encoded.put_u8(AUX_FILE_ENCODING_VERSION);
    for (path, content) in files {
        if path.len() > u32::MAX as usize {
            anyhow::bail!("{} exceeds path size limit", path);
        }
        encoded.put_u32(path.len() as u32);
        encoded.put_slice(path.as_bytes());
        if content.len() > u32::MAX as usize {
            anyhow::bail!("{} exceeds content size limit", path);
        }
        encoded.put_u32(content.len() as u32);
        encoded.put_slice(content);
    }
    Ok(encoded)
}

#[cfg(test)]
mod tests {
    use super::*;

@@ -188,21 +109,4 @@ mod tests {
            encode_aux_file_key("other_file_not_supported").to_string()
        );
    }

    #[test]
    fn test_value_encoding() {
        let files = vec![
            ("pg_logical/1.file", "1111".as_bytes()),
            ("pg_logical/2.file", "2222".as_bytes()),
        ];
        assert_eq!(
            files,
            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
        );
        let files = vec![];
        assert_eq!(
            files,
            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
        );
    }
}
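The encoding deleted above is a simple length-prefixed format: a one-byte version tag, then for each file a u32 path length, the path bytes, a u32 content length, and the content bytes. A hedged sketch of how a writer can fold one file update into an existing encoded value (decode, drop any entry for the same path, append the new content unless it is empty, re-encode), which is the shape of the aux-file v2 write path seen later in this diff; it relies only on the `decode_file_value`/`encode_file_value` helpers shown above:

```rust
// Sketch only: assumes the decode_file_value / encode_file_value helpers
// from the hunk above are in scope.
fn upsert_file(
    old_encoded: Option<&[u8]>,
    path: &str,
    content: &[u8],
) -> anyhow::Result<Vec<u8>> {
    // Decode the previous value, if any; an absent/empty value means "no files".
    let old_files = match old_encoded {
        Some(val) => decode_file_value(val)?,
        None => Vec::new(),
    };

    // Drop any existing entry for this path, then append the new content
    // unless it is empty (empty content doubles as a deletion marker here).
    let mut new_files: Vec<(&str, &[u8])> = old_files
        .into_iter()
        .filter(|(p, _)| *p != path)
        .collect();
    if !content.is_empty() {
        new_files.push((path, content));
    }

    encode_file_value(&new_files)
}
```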
@@ -540,12 +540,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        js.spawn(async move {
            layer
                .secondary_tenant
                .evict_layer(
                    tenant_manager.get_conf(),
                    layer.timeline_id,
                    layer.name,
                    layer.metadata,
                )
                .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
                .await;
            Ok(file_size)
        });
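`js` here appears to be a `tokio::task::JoinSet`: each candidate layer is evicted in its own task, and each task returns the file size it freed so the caller can tally the total. A standalone sketch of that spawn-and-collect pattern with a stubbed eviction function (all names hypothetical):

```rust
use tokio::task::JoinSet;

/// Stand-in for evicting one layer; returns the bytes it freed.
async fn evict_one(layer_id: u32, file_size: u64) -> u64 {
    // The real code calls the secondary tenant's eviction method here.
    let _ = layer_id;
    file_size
}

#[tokio::main]
async fn main() {
    let candidates = vec![(1u32, 8 * 1024u64), (2, 16 * 1024), (3, 4 * 1024)];

    let mut js = JoinSet::new();
    for (layer_id, file_size) in candidates {
        js.spawn(async move { evict_one(layer_id, file_size).await });
    }

    // Collect results as tasks finish; a join error means the task panicked.
    let mut freed = 0u64;
    while let Some(res) = js.join_next().await {
        freed += res.expect("eviction task panicked");
    }
    println!("freed {freed} bytes");
}
```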
@@ -63,7 +63,6 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::SpawnMode;
|
||||
@@ -1229,15 +1228,13 @@ async fn layer_download_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let layer_name = LayerFileName::from_str(layer_file_name)
|
||||
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let downloaded = timeline
|
||||
.download_layer(&layer_name)
|
||||
.download_layer(layer_file_name)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1261,14 +1258,11 @@ async fn evict_timeline_layer_handler(
|
||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let layer_name = LayerFileName::from_str(layer_file_name)
|
||||
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let evicted = timeline
|
||||
.evict_layer(&layer_name)
|
||||
.evict_layer(layer_file_name)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1833,75 +1827,6 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
async fn timeline_detach_ancestor_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::timeline::detach_ancestor::Options;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
|
||||
|
||||
async move {
|
||||
let mut options = Options::default();
|
||||
|
||||
let rewrite_concurrency =
|
||||
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
|
||||
let copy_concurrency =
|
||||
parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?;
|
||||
|
||||
[
|
||||
(&mut options.rewrite_concurrency, rewrite_concurrency),
|
||||
(&mut options.copy_concurrency, copy_concurrency),
|
||||
]
|
||||
.into_iter()
|
||||
.filter_map(|(target, val)| val.map(|val| (target, val)))
|
||||
.for_each(|(target, val)| *target = val);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(reparented_timelines) => {
|
||||
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
|
||||
reparented_timelines,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
Err(e) => Err(ApiError::InternalServerError(
|
||||
e.context("timeline detach completion"),
|
||||
)),
|
||||
}
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn deletion_queue_flush(
|
||||
r: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
@@ -2590,10 +2515,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|
||||
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor",
|
||||
|r| api_handler(r, timeline_detach_ancestor_handler),
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_delete_handler)
|
||||
})
|
||||
|
||||
@@ -1512,80 +1512,29 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
|
||||
});
|
||||
|
||||
pub(crate) struct TenantManagerMetrics {
|
||||
tenant_slots_attached: UIntGauge,
|
||||
tenant_slots_secondary: UIntGauge,
|
||||
tenant_slots_inprogress: UIntGauge,
|
||||
pub(crate) tenant_slots: UIntGauge,
|
||||
pub(crate) tenant_slot_writes: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
}
|
||||
|
||||
impl TenantManagerMetrics {
|
||||
/// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
|
||||
/// exactly: they track the lifetime of the slots _in the tenant map_.
|
||||
pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
|
||||
match slot {
|
||||
TenantSlot::Attached(_) => {
|
||||
self.tenant_slots_attached.inc();
|
||||
}
|
||||
TenantSlot::Secondary(_) => {
|
||||
self.tenant_slots_secondary.inc();
|
||||
}
|
||||
TenantSlot::InProgress(_) => {
|
||||
self.tenant_slots_inprogress.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
|
||||
match slot {
|
||||
TenantSlot::Attached(_) => {
|
||||
self.tenant_slots_attached.dec();
|
||||
}
|
||||
TenantSlot::Secondary(_) => {
|
||||
self.tenant_slots_secondary.dec();
|
||||
}
|
||||
TenantSlot::InProgress(_) => {
|
||||
self.tenant_slots_inprogress.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(debug_assertions, not(test)))]
|
||||
pub(crate) fn slots_total(&self) -> u64 {
|
||||
self.tenant_slots_attached.get()
|
||||
+ self.tenant_slots_secondary.get()
|
||||
+ self.tenant_slots_inprogress.get()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
|
||||
let tenant_slots = register_uint_gauge_vec!(
|
||||
TenantManagerMetrics {
|
||||
tenant_slots: register_uint_gauge!(
|
||||
"pageserver_tenant_manager_slots",
|
||||
"How many slots currently exist, including all attached, secondary and in-progress operations",
|
||||
&["mode"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
TenantManagerMetrics {
|
||||
tenant_slots_attached: tenant_slots
|
||||
.get_metric_with_label_values(&["attached"])
|
||||
.unwrap(),
|
||||
tenant_slots_secondary: tenant_slots
|
||||
.get_metric_with_label_values(&["secondary"])
|
||||
.unwrap(),
|
||||
tenant_slots_inprogress: tenant_slots
|
||||
.get_metric_with_label_values(&["inprogress"])
|
||||
.unwrap(),
|
||||
tenant_slot_writes: register_int_counter!(
|
||||
"pageserver_tenant_manager_slot_writes",
|
||||
"Writes to a tenant slot, including all of create/attach/detach/delete"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
unexpected_errors: register_int_counter!(
|
||||
"pageserver_tenant_manager_unexpected_errors_total",
|
||||
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
.expect("failed to define a metric"),
|
||||
tenant_slot_writes: register_int_counter!(
|
||||
"pageserver_tenant_manager_slot_writes",
|
||||
"Writes to a tenant slot, including all of create/attach/detach/delete"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
unexpected_errors: register_int_counter!(
|
||||
"pageserver_tenant_manager_unexpected_errors_total",
|
||||
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct DeletionQueueMetrics {
|
||||
@@ -2326,7 +2275,6 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
struct PerTimelineRemotePhysicalSizeGauge {
|
||||
@@ -2929,8 +2877,6 @@ pub fn preinitialize_metrics() {
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
|
||||
&REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
|
||||
@@ -10,9 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::repository::*;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
@@ -24,7 +24,6 @@ use pageserver_api::key::{
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -671,7 +670,7 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
async fn list_aux_files_v1(
|
||||
pub(crate) async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -689,63 +688,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_aux_files_v2(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
||||
.await
|
||||
.context("scan")?;
|
||||
let mut result = HashMap::new();
|
||||
for (_, v) in kv {
|
||||
let v = v.context("get value")?;
|
||||
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
|
||||
for (fname, content) in v {
|
||||
result.insert(fname, content);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get_switch_aux_file_policy() {
|
||||
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
|
||||
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
|
||||
AuxFilePolicy::CrossValidation => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
|
||||
match (v1_result, v2_result) {
|
||||
(Ok(v1), Ok(v2)) => {
|
||||
if v1 != v2 {
|
||||
tracing::error!(
|
||||
"unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
|
||||
);
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"unmatched aux file v1 v2 result"
|
||||
)));
|
||||
}
|
||||
Ok(v1)
|
||||
}
|
||||
(Ok(_), Err(v2)) => {
|
||||
tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
|
||||
Err(v2)
|
||||
}
|
||||
(Err(v1), Ok(_)) => {
|
||||
tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
|
||||
Err(v1)
|
||||
}
|
||||
(Err(_), Err(v2)) => Err(v2),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used to initialize the logical size tracking on startup.
|
||||
///
|
||||
@@ -1447,9 +1389,6 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
|
||||
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
|
||||
return Ok(());
|
||||
}
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
@@ -1465,122 +1404,90 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let policy = self.tline.get_switch_aux_file_policy();
|
||||
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
Ok(val) => Some(val),
|
||||
Err(PageReconstructError::MissingKey(_)) => None,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
let new_files = if content.is_empty() {
|
||||
files
|
||||
.into_iter()
|
||||
.filter(|(p, _)| &path != p)
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
files
|
||||
.into_iter()
|
||||
.filter(|(p, _)| &path != p)
|
||||
.chain(std::iter::once((path, content)))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
}
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value.
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
aux_files.n_deltas = 0;
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
|
||||
);
|
||||
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.

let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.

let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
n_files = 1;
aux_files.dir = Some(dir);
}
n_files = 1;
aux_files.dir = Some(dir);
}
}

self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
}

self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));

Ok(())
}

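The hunk above caps the AUX_FILES_KEY delta chain: a write normally appends a NeonWalRecord::AuxFile delta, and once n_deltas reaches MAX_AUX_FILE_DELTAS a full AuxFilesDirectory image is written and the counter resets. A minimal standalone sketch of that decision follows; the constant's value and the AuxValue type are illustrative assumptions, not the pageserver's API.

// Sketch only: models the image-vs-delta choice made above for AUX_FILES_KEY.
const MAX_AUX_FILE_DELTAS: usize = 1024; // assumed value for illustration

enum AuxValue {
    Image, // full serialized directory, becomes the new base
    Delta, // one WAL-record delta appended on top of the base
}

fn next_aux_value(n_deltas: &mut usize) -> AuxValue {
    if *n_deltas == MAX_AUX_FILE_DELTAS {
        *n_deltas = 0; // writing an image resets the chain
        AuxValue::Image
    } else {
        *n_deltas += 1; // otherwise keep extending the delta chain
        AuxValue::Delta
    }
}

fn main() {
    let mut n_deltas = 0;
    for _ in 0..3 {
        match next_aux_value(&mut n_deltas) {
            AuxValue::Image => println!("write full image"),
            AuxValue::Delta => println!("append delta ({n_deltas} since last image)"),
        }
    }
}
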
@@ -33,6 +33,7 @@ impl Value {
}
}

#[cfg(test)]
#[derive(Debug, PartialEq)]
pub(crate) enum InvalidInput {
TooShortValue,
@@ -41,8 +42,10 @@ pub(crate) enum InvalidInput {

/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
/// use this type for querying if a slice looks some particular way.
#[cfg(test)]
pub(crate) struct ValueBytes;

#[cfg(test)]
impl ValueBytes {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {

@@ -319,9 +319,6 @@ pub enum TaskKind {
// Eviction. One per timeline.
Eviction,

// Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
IngestHousekeeping,

/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,

@@ -370,8 +367,6 @@ pub enum TaskKind {

#[cfg(test)]
UnitTest,

DetachAncestor,
}

#[derive(Default)]

@@ -322,9 +322,6 @@ pub struct Tenant {
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,

/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
}

impl std::fmt::Debug for Tenant {
@@ -1679,34 +1676,6 @@ impl Tenant {
Ok(())
}

// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
async fn ingest_housekeeping(&self) {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines = {
self.timelines
.lock()
.unwrap()
.values()
.filter_map(|timeline| {
if timeline.is_active() {
Some(timeline.clone())
} else {
None
}
})
.collect::<Vec<_>>()
};

for timeline in &timelines {
timeline.maybe_freeze_ephemeral_layer().await;
}
}

pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -2560,7 +2529,6 @@ impl Tenant {
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
}
}

@@ -3758,7 +3726,7 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
}
}
}

@@ -9,7 +9,6 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
@@ -371,9 +370,9 @@ pub struct TenantConf {
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,

/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
/// Switch to aux file v2. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
pub switch_aux_file_policy: AuxFilePolicy,
pub switch_to_aux_file_v2: bool,
}

/// Same as TenantConf, but this struct preserves the information about
@@ -472,7 +471,7 @@ pub struct TenantConfOpt {

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,
pub switch_to_aux_file_v2: Option<bool>,
}

impl TenantConfOpt {
@@ -530,9 +529,9 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
switch_to_aux_file_v2: self
.switch_to_aux_file_v2
.unwrap_or(global_conf.switch_to_aux_file_v2),
}
}
}
@@ -574,7 +573,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::V1,
switch_to_aux_file_v2: false,
}
}
}
@@ -649,7 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
switch_to_aux_file_v2: value.switch_to_aux_file_v2,
}
}
}

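The TenantConfOpt hunks above resolve each optional per-tenant setting against the global default via unwrap_or. A minimal standalone sketch of that fallback pattern, using simplified stand-in types rather than the real TenantConf/TenantConfOpt structs:

// Sketch only: an unset per-tenant option falls back to the global default.
#[derive(Clone, Copy, Debug, PartialEq)]
enum AuxFilePolicy {
    V1,
    V2,
}

struct GlobalConf {
    switch_aux_file_policy: AuxFilePolicy,
}

struct TenantOverride {
    switch_aux_file_policy: Option<AuxFilePolicy>,
}

impl TenantOverride {
    fn merge(&self, global: &GlobalConf) -> AuxFilePolicy {
        // Mirrors the `.unwrap_or(global_conf.…)` pattern in the diff above.
        self.switch_aux_file_policy
            .unwrap_or(global.switch_aux_file_policy)
    }
}

fn main() {
    let global = GlobalConf { switch_aux_file_policy: AuxFilePolicy::V1 };
    let unset = TenantOverride { switch_aux_file_policy: None };
    let set = TenantOverride { switch_aux_file_policy: Some(AuxFilePolicy::V2) };
    assert_eq!(unset.merge(&global), AuxFilePolicy::V1); // falls back to global default
    assert_eq!(set.merge(&global), AuxFilePolicy::V2); // explicit override wins
}
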
@@ -585,20 +585,9 @@ impl DeleteTenantFlow {

// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)

// Update stats
match &removed {
TenantsMapRemoveResult::Occupied(slot) => {
crate::metrics::TENANT_MANAGER.slot_removed(slot);
}
TenantsMapRemoveResult::InProgress(barrier) => {
crate::metrics::TENANT_MANAGER
.slot_removed(&TenantSlot::InProgress(barrier.clone()));
}
TenantsMapRemoveResult::Vacant => {
// Nothing changed in map, no metric update
}
}
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);

match removed {
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {

@@ -207,24 +207,6 @@ impl TimelineMetadata {
self.body.ancestor_lsn
}

/// When reparenting, the `ancestor_lsn` does not change.
pub fn reparent(&mut self, timeline: &TimelineId) {
assert!(self.body.ancestor_timeline.is_some());
// no assertion for redoing this: it's fine, we may have to repeat this multiple times over
self.body.ancestor_timeline = Some(*timeline);
}

pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, *timeline);
}
if self.body.ancestor_lsn != Lsn(0) {
assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
}
self.body.ancestor_timeline = None;
self.body.ancestor_lsn = Lsn(0);
}

pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
self.body.latest_gc_cutoff_lsn
}

@@ -56,7 +56,6 @@ use utils::id::{TenantId, TimelineId};

use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;

/// For a tenant that appears in TenantsMap, it may either be
@@ -247,7 +246,6 @@ impl TenantsMap {
}
}

#[cfg(all(debug_assertions, not(test)))]
pub(crate) fn len(&self) -> usize {
match self {
TenantsMap::Initializing => 0,
@@ -748,7 +746,6 @@ pub async fn init_tenant_mgr(
}
};

METRICS.slot_inserted(&slot);
tenants.insert(tenant_shard_id, slot);
}

@@ -756,7 +753,7 @@ pub async fn init_tenant_mgr(

let mut tenants_map = TENANTS.write().unwrap();
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));

METRICS.tenant_slots.set(tenants.len() as u64);
*tenants_map = TenantsMap::Open(tenants);

Ok(TenantManager {
@@ -827,14 +824,6 @@ fn tenant_spawn(
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();

#[cfg(all(debug_assertions, not(test)))]
{
// Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check,
// as it happens implicitly at the end of tests etc.
let m = tenants.read().unwrap();
debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
}

// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
let (total_in_progress, total_attached) = {
let mut m = tenants.write().unwrap();
@@ -2008,101 +1997,6 @@ impl TenantManager {
})
.collect())
}

/// Completes an earlier prepared timeline detach ancestor.
pub(crate) async fn complete_detaching_timeline_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
struct RevertOnDropSlot(Option<SlotGuard>);

impl Drop for RevertOnDropSlot {
fn drop(&mut self) {
if let Some(taken) = self.0.take() {
taken.revert();
}
}
}

impl RevertOnDropSlot {
fn into_inner(mut self) -> SlotGuard {
self.0.take().unwrap()
}
}

impl std::ops::Deref for RevertOnDropSlot {
type Target = SlotGuard;

fn deref(&self) -> &Self::Target {
self.0.as_ref().unwrap()
}
}

let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
let slot_guard = RevertOnDropSlot(Some(slot_guard));

let tenant = {
let Some(old_slot) = slot_guard.get_old_value() else {
anyhow::bail!(
"Tenant not found when trying to complete detaching timeline ancestor"
);
};

let Some(tenant) = old_slot.get_attached() else {
anyhow::bail!("Tenant is not in attached state");
};

if !tenant.is_active() {
anyhow::bail!("Tenant is not active");
}

tenant.clone()
};

let timeline = tenant.get_timeline(timeline_id, true)?;

let reparented = timeline
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
.await?;

let mut slot_guard = slot_guard.into_inner();

let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
}

let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;

let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
ctx,
)?;

slot_guard.upsert(TenantSlot::Attached(tenant))?;

Ok(reparented)
}
}

#[derive(Debug, thiserror::Error)]
@@ -2534,13 +2428,10 @@ impl SlotGuard {
TenantsMap::Open(m) => m,
};

METRICS.slot_inserted(&new_value);

let replaced = m.insert(self.tenant_shard_id, new_value);
self.upserted = true;
if let Some(replaced) = replaced.as_ref() {
METRICS.slot_removed(replaced);
}

METRICS.tenant_slots.set(m.len() as u64);

replaced
};
@@ -2650,13 +2541,9 @@ impl Drop for SlotGuard {
}

if self.old_value_is_shutdown() {
METRICS.slot_removed(entry.get());
entry.remove();
} else {
let inserting = self.old_value.take().unwrap();
METRICS.slot_inserted(&inserting);
let replaced = entry.insert(inserting);
METRICS.slot_removed(&replaced);
entry.insert(self.old_value.take().unwrap());
}
}
Entry::Vacant(_) => {
@@ -2667,6 +2554,8 @@ impl Drop for SlotGuard {
);
}
}

METRICS.tenant_slots.set(m.len() as u64);
}
}

@@ -2746,9 +2635,7 @@ fn tenant_map_acquire_slot_impl(
}
_ => {
let (completion, barrier) = utils::completion::channel();
let inserting = TenantSlot::InProgress(barrier);
METRICS.slot_inserted(&inserting);
v.insert(inserting);
v.insert(TenantSlot::InProgress(barrier));
tracing::debug!("Vacant, inserted InProgress");
Ok(SlotGuard::new(*tenant_shard_id, None, completion))
}
@@ -2784,10 +2671,7 @@ fn tenant_map_acquire_slot_impl(
_ => {
// Happy case: the slot was not in any state that violated our mode
let (completion, barrier) = utils::completion::channel();
let in_progress = TenantSlot::InProgress(barrier);
METRICS.slot_inserted(&in_progress);
let old_value = o.insert(in_progress);
METRICS.slot_removed(&old_value);
let old_value = o.insert(TenantSlot::InProgress(barrier));
tracing::debug!("Occupied, replaced with InProgress");
Ok(SlotGuard::new(
*tenant_shard_id,

@@ -570,7 +570,7 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();

self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

Ok(())
}
@@ -591,7 +591,7 @@ impl RemoteTimelineClient {

upload_queue.latest_metadata.apply(update);

self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

Ok(())
}
@@ -611,14 +611,18 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;

if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
}

Ok(())
}

/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

info!(
@@ -627,7 +631,11 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);

let index_part = IndexPart::from(&*upload_queue);
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -637,61 +645,9 @@ impl RemoteTimelineClient {
self.launch_queued_tasks(upload_queue);
}

pub(crate) async fn schedule_reparenting_and_wait(
self: &Arc<Self>,
new_parent: &TimelineId,
) -> anyhow::Result<()> {
// FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
// and reads the in-memory part we cannot do the detaching like this
let receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;

upload_queue.latest_metadata.reparent(new_parent);

self.schedule_index_upload(upload_queue);

self.schedule_barrier0(upload_queue)
};

Self::wait_completion0(receiver).await
}

/// Schedules uploading a new version of `index_part.json` with the given layers added,
/// detaching from ancestor and waits for it to complete.
///
/// This is used with `Timeline::detach_ancestor` functionality.
pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait(
self: &Arc<Self>,
layers: &[Layer],
adopted: (TimelineId, Lsn),
) -> anyhow::Result<()> {
let barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;

upload_queue
.latest_metadata
.detach_from_ancestor(&adopted.0, &adopted.1);

for layer in layers {
upload_queue
.latest_files
.insert(layer.layer_desc().filename(), layer.metadata());
}

self.schedule_index_upload(upload_queue);

let barrier = self.schedule_barrier0(upload_queue);
self.launch_queued_tasks(upload_queue);
barrier
};

Self::wait_completion0(barrier).await
}

/// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload.
/// Launch an upload operation in the background.
///
pub(crate) fn schedule_layer_file_upload(
self: &Arc<Self>,
layer: ResidentLayer,
@@ -717,11 +673,9 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

info!(
gen=?metadata.generation,
shard=?metadata.shard,
"scheduled layer file upload {layer}",
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);

let op = UploadOp::UploadLayer(layer, metadata);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -784,6 +738,10 @@ impl RemoteTimelineClient {
where
I: IntoIterator<Item = LayerFileName>,
{
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();

// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
// is later used when physically deleting layers, to construct key paths.
@@ -822,7 +780,7 @@ impl RemoteTimelineClient {
// index_part update, because that needs to be uploaded before we can actually delete the
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, metadata);
}

with_metadata
@@ -924,18 +882,12 @@ impl RemoteTimelineClient {

/// Wait for all previously scheduled uploads/deletions to complete
pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
let receiver = {
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue)
};

Self::wait_completion0(receiver).await
}

async fn wait_completion0(
mut receiver: tokio::sync::watch::Receiver<()>,
) -> anyhow::Result<()> {
if receiver.changed().await.is_err() {
anyhow::bail!("wait_completion aborted because upload queue was stopped");
}
@@ -1051,7 +1003,8 @@ impl RemoteTimelineClient {
let deleted_at = Utc::now().naive_utc();
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
.context("IndexPart serialize")?;
index_part.deleted_at = Some(deleted_at);
index_part
};
@@ -1132,93 +1085,6 @@ impl RemoteTimelineClient {
Ok(())
}

/// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload.
///
/// This is not normally needed.
pub(crate) async fn upload_layer_file(
self: &Arc<Self>,
uploaded: &ResidentLayer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&uploaded.layer_desc().filename(),
uploaded.metadata().generation,
);

backoff::retry(
|| async {
upload::upload_timeline_layer(
&self.storage_impl,
uploaded.local_path(),
&remote_path,
uploaded.metadata().file_size(),
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"upload a layer without adding it to latest files",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("upload a layer without adding it to latest files")
}

/// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is
/// not added to be part of a future `index_part.json` upload.
pub(crate) async fn copy_timeline_layer(
self: &Arc<Self>,
adopted: &Layer,
adopted_as: &Layer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let source_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&adopted
.get_timeline_id()
.expect("Source timeline should be alive"),
self.tenant_shard_id.to_index(),
&adopted.layer_desc().filename(),
adopted.metadata().generation,
);

let target_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&adopted_as.layer_desc().filename(),
adopted_as.metadata().generation,
);

backoff::retry(
|| async {
upload::copy_timeline_layer(
&self.storage_impl,
&source_remote_path,
&target_remote_path,
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"copy timeline layer",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remote copy timeline layer")
}

async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
@@ -1390,7 +1256,7 @@ impl RemoteTimelineClient {
while let Some(next_op) = upload_queue.queued_operations.front() {
// Can we run this task now?
let can_run_now = match next_op {
UploadOp::UploadLayer(..) => {
UploadOp::UploadLayer(_, _) => {
// Can always be scheduled.
true
}
@@ -1517,25 +1383,13 @@ impl RemoteTimelineClient {

let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let local_path = layer.local_path();

// We should only be uploading layers created by this `Tenant`'s lifetime, so
// the metadata in the upload should always match our current generation.
assert_eq!(layer_metadata.generation, self.generation);

let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
layer_metadata.shard,
&layer.layer_desc().filename(),
layer_metadata.generation,
);

let path = layer.local_path();
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
local_path,
&remote_path,
layer_metadata.file_size(),
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
@@ -1964,6 +1818,29 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}
}

/// Files on the remote storage are stored with paths, relative to the workdir.
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
///
/// Errors if the path provided does not start from pageserver's workdir.
pub fn remote_path(
conf: &PageServerConf,
local_path: &Utf8Path,
generation: Generation,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;

let suffixed = format!("{0}{1}", stripped, generation.get_suffix());

RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
format!(
"to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
)
})
}

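remote_path above builds the remote key by stripping the pageserver workdir prefix from the local layer path and appending the generation suffix. A rough standalone sketch of the same transformation; the concrete paths and the "-00000001" suffix are made-up examples, and std paths stand in for Utf8Path, PageServerConf and Generation::get_suffix():

// Sketch only: mirrors the strip-prefix plus suffix logic of remote_path above.
use std::path::Path;

fn sketch_remote_path(workdir: &Path, local: &Path, gen_suffix: &str) -> Option<String> {
    // The real code returns an error here; Option keeps the sketch short.
    let stripped = local.strip_prefix(workdir).ok()?;
    Some(format!("{}{}", stripped.display(), gen_suffix))
}

fn main() {
    let workdir = Path::new("/data/.neon");
    let local = Path::new("/data/.neon/tenants/t1/timelines/tl1/layerfile");
    // "-00000001" is an illustrative generation suffix, not the crate's format.
    let remote = sketch_remote_path(workdir, local, "-00000001").unwrap();
    assert_eq!(remote, "tenants/t1/timelines/tl1/layerfile-00000001");
}
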
#[cfg(test)]
mod tests {
use super::*;
@@ -1971,7 +1848,6 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::layer::local_layer_path,
Tenant, Timeline,
},
DEFAULT_PG_VERSION,
@@ -2154,20 +2030,11 @@ mod tests {
]
.into_iter()
.map(|(name, contents): (LayerFileName, Vec<u8>)| {

let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&name,
&generation,
);
std::fs::write(&local_path, &contents).unwrap();
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();

Layer::for_resident(
harness.conf,
&timeline,
local_path,
name,
LayerFileMetadata::new(contents.len() as u64, generation, shard),
)
@@ -2304,22 +2171,19 @@ mod tests {
..
} = TestSetup::new("metrics").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);

let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&layer_file_name_1,
&harness.generation,
);
let content_1 = dummy_contents("foo");
std::fs::write(&local_path, &content_1).unwrap();
std::fs::write(
timeline_path.join(layer_file_name_1.file_name()),
&content_1,
)
.unwrap();

let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
local_path,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
);
@@ -2388,7 +2252,12 @@ mod tests {

async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_index_part = IndexPart::example();
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
);

let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -56,13 +55,7 @@ pub async fn download_layer_file<'a>(
debug_assert_current_span_has_tenant_and_timeline_id();

let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = local_layer_path(
conf,
&tenant_shard_id,
&timeline_id,
layer_file_name,
&layer_metadata.generation,
);
let local_path = timeline_path.join(layer_file_name.file_name());

let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,

@@ -6,6 +6,7 @@ use std::collections::HashMap;

use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::bin_ser::SerializeError;

use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerFileName;
@@ -103,14 +104,15 @@ impl IndexPart {

pub const FILE_NAME: &'static str = "index_part.json";

fn new(
layers_and_metadata: &HashMap<LayerFileName, LayerFileMetadata>,
pub fn new(
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
) -> Self {
// Transform LayerFileMetadata into IndexLayerMetadata
let layer_metadata = layers_and_metadata
.iter()
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
.into_iter()
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
.collect();

Self {
@@ -139,24 +141,20 @@ impl IndexPart {
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}

#[cfg(test)]
pub(crate) fn example() -> Self {
let example_metadata = TimelineMetadata::example();
Self::new(
&HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
)
}
}

impl From<&UploadQueueInitialized> for IndexPart {
fn from(uq: &UploadQueueInitialized) -> Self {
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
let metadata = uq.latest_metadata.clone();
impl TryFrom<&UploadQueueInitialized> for IndexPart {
type Error = SerializeError;

Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let metadata = upload_queue.latest_metadata.clone();

Ok(Self::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
))
}
}

@@ -174,8 +172,8 @@ pub struct IndexLayerMetadata {
pub shard: ShardIndex,
}

impl From<&LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &LayerFileMetadata) -> Self {
impl From<LayerFileMetadata> for IndexLayerMetadata {
fn from(other: LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,

@@ -12,13 +12,18 @@ use tokio_util::sync::CancellationToken;
use utils::backoff;

use super::Generation;
use crate::tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path,
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_path,
},
};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
use remote_storage::{GenericRemoteStorage, TimeTravelError};
use utils::id::{TenantId, TimelineId};

use super::index::LayerFileMetadata;

use tracing::info;

/// Serializes and uploads the given index part data to the remote storage.
@@ -60,10 +65,11 @@ pub(crate) async fn upload_index_part<'a>(
///
/// On an error, bumps the retries count and reschedules the entire task.
pub(super) async fn upload_timeline_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_path: &'a Utf8Path,
remote_path: &'a RemotePath,
metadata_size: u64,
source_path: &'a Utf8Path,
known_metadata: &'a LayerFileMetadata,
generation: Generation,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
@@ -72,7 +78,8 @@ pub(super) async fn upload_timeline_layer<'a>(

pausable_failpoint!("before-upload-layer-pausable");

let source_file_res = fs::File::open(&local_path).await;
let storage_path = remote_path(conf, source_path, generation)?;
let source_file_res = fs::File::open(&source_path).await;
let source_file = match source_file_res {
Ok(source_file) => source_file,
Err(e) if e.kind() == ErrorKind::NotFound => {
@@ -83,49 +90,34 @@ pub(super) async fn upload_timeline_layer<'a>(
// it has been written to disk yet.
//
// This is tested against `test_compaction_delete_before_upload`
info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}
Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?,
Err(e) => {
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
}
};

let fs_size = source_file
.metadata()
.await
.with_context(|| format!("get the source file metadata for layer {local_path:?}"))?
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
.len();

let metadata_size = known_metadata.file_size();
if metadata_size != fs_size {
bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
}

let fs_size = usize::try_from(fs_size)
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;

let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);

storage
.upload(reader, fs_size, remote_path, None, cancel)
.upload(reader, fs_size, &storage_path, None, cancel)
.await
.with_context(|| format!("upload layer from local path '{local_path}'"))
}

pub(super) async fn copy_timeline_layer(
storage: &GenericRemoteStorage,
source_path: &RemotePath,
target_path: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-copy-layer", |_| {
bail!("failpoint before-copy-layer")
});

pausable_failpoint!("before-copy-layer-pausable");

storage
.copy_object(source_path, target_path, cancel)
.await
.with_context(|| format!("copy layer {source_path} to {target_path}"))
.with_context(|| format!("upload layer from local path '{source_path}'"))
}

/// Uploads the given `initdb` data to the remote storage.

@@ -21,9 +21,8 @@ use self::{
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerFileName},
storage_layer::LayerFileName,
};

use pageserver_api::{
@@ -183,7 +182,6 @@ impl SecondaryTenant {
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
metadata: LayerFileMetadata,
) {
debug_assert_current_span_has_tenant_id();

@@ -197,13 +195,9 @@ impl SecondaryTenant {

let now = SystemTime::now();

let local_path = local_layer_path(
conf,
&self.tenant_shard_id,
&timeline_id,
&name,
&metadata.generation,
);
let path = conf
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());

let this = self.clone();

@@ -214,7 +208,7 @@ impl SecondaryTenant {
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(local_path);
let deleted = std::fs::remove_file(path);

let not_found = deleted
.as_ref()

@@ -22,7 +22,7 @@ use crate::{
FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerFileName},
storage_layer::LayerFileName,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
@@ -621,12 +621,12 @@ impl<'a> TenantDownloader<'a> {
let layers_in_heatmap = heatmap_timeline
.layers
.iter()
.map(|l| (&l.name, l.metadata.generation))
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| (l.0, l.1.metadata.generation))
.map(|l| l.0)
.collect::<HashSet<_>>();

let mut layer_count = layers_on_disk.len();
@@ -637,24 +637,16 @@ impl<'a> TenantDownloader<'a> {
.sum();

// Remove on-disk layers that are no longer present in heatmap
for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) {
for layer in layers_on_disk.difference(&layers_in_heatmap) {
layer_count -= 1;
layer_byte_count -= timeline_state
.on_disk_layers
.get(layer_file_name)
.get(layer)
.unwrap()
.metadata
.file_size();

let local_path = local_layer_path(
self.conf,
self.secondary_state.get_tenant_shard_id(),
timeline_id,
layer_file_name,
generation,
);

delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path));
delete_layers.push((*timeline_id, (*layer).clone()));
}

progress.bytes_downloaded += layer_byte_count;
@@ -669,7 +661,11 @@ impl<'a> TenantDownloader<'a> {
}

// Execute accumulated deletions
for (timeline_id, layer_name, local_path) in delete_layers {
for (timeline_id, layer_name) in delete_layers {
let timeline_path = self
.conf
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
let local_path = timeline_path.join(layer_name.to_string());
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);

tokio::fs::remove_file(&local_path)
@@ -758,6 +754,9 @@ impl<'a> TenantDownloader<'a> {
) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id);

// Accumulate updates to the state
let mut touched = Vec::new();
@@ -807,14 +806,10 @@ impl<'a> TenantDownloader<'a> {
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);

let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
@@ -908,13 +903,7 @@ impl<'a> TenantDownloader<'a> {
};

if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
let local_path = timeline_path.join(layer.name.to_string());

tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",

@@ -1139,15 +1139,15 @@ impl DeltaLayerInner {
Ok(all_keys)
}

/// Using the given writer, write out a version which has the earlier Lsns than `until`.
///
/// Return the amount of key value records pushed to the writer.
/// Using the given writer, write out a truncated version, where LSNs higher than the
/// truncate_at are missing.
#[cfg(test)]
pub(super) async fn copy_prefix(
&self,
writer: &mut DeltaLayerWriter,
until: Lsn,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
) -> anyhow::Result<()> {
use crate::tenant::vectored_blob_io::{
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
};
@@ -1211,8 +1211,6 @@ impl DeltaLayerInner {
// FIXME: buffering of DeltaLayerWriter
let mut per_blob_copy = Vec::new();

let mut records = 0;

while let Some(item) = stream.try_next().await? {
tracing::debug!(?item, "popped");
let offset = item
@@ -1231,7 +1229,7 @@ impl DeltaLayerInner {

prev = Option::from(item);

let actionable = actionable.filter(|x| x.0.lsn < until);
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);

let builder = if let Some((meta, offsets)) = actionable {
// extend or create a new builder
@@ -1299,7 +1297,7 @@ impl DeltaLayerInner {
let will_init = crate::repository::ValueBytes::will_init(data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);

@@ -1316,10 +1314,7 @@ impl DeltaLayerInner {
)
.await;
per_blob_copy = tmp;

res?;

records += 1;
}

buffer = Some(res.buf);
@@ -1331,7 +1326,7 @@ impl DeltaLayerInner {
"with the sentinel above loop should had handled all"
);

Ok(records)
Ok(())
}

pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -1404,6 +1399,7 @@ impl DeltaLayerInner {
Ok(())
}

#[cfg(test)]
fn stream_index_forwards<'a, R>(
&'a self,
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,

@@ -2,13 +2,11 @@
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use crate::repository::Key;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::str::FromStr;

use regex::Regex;
use utils::lsn::Lsn;

use super::PersistentLayerDesc;
@@ -76,19 +74,10 @@ impl DeltaFileName {
let key_end_str = key_parts.next()?;
let lsn_start_str = lsn_parts.next()?;
let lsn_end_str = lsn_parts.next()?;

if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
return None;
}

if key_start_str.len() != 36
|| key_end_str.len() != 36
|| lsn_start_str.len() != 16
|| lsn_end_str.len() != 16
{
return None;
}

let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;

@@ -193,10 +182,6 @@ impl ImageFileName {
return None;
}

if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
return None;
}

let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;

@@ -274,22 +259,9 @@ impl From<DeltaFileName> for LayerFileName {
impl FromStr for LayerFileName {
type Err = String;

/// Conversion from either a physical layer filename, or the string-ization of
/// Self. When loading a physical layer filename, we drop any extra information
/// not needed to build Self.
fn from_str(value: &str) -> Result<Self, Self::Err> {
let gen_suffix_regex = Regex::new("^(?<base>.+)-(?<gen>[0-9a-f]{8})$").unwrap();
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
Some(captures) => captures
.name("base")
.expect("Non-optional group")
.as_str()
.into(),
None => value.into(),
};

let delta = DeltaFileName::parse_str(&file_name);
let image = ImageFileName::parse_str(&file_name);
let delta = DeltaFileName::parse_str(value);
let image = ImageFileName::parse_str(value);
let ok = match (delta, image) {
(None, None) => {
return Err(format!(
@@ -343,42 +315,3 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
v.parse().map_err(|e| E::custom(e))
}
}

#[cfg(test)]
mod test {
use super::*;
#[test]
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerFileName::Image(ImageFileName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);

// Omitting generation suffix is valid
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);

Ok(())
}

#[test]
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);

// Omitting generation suffix is valid
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);

Ok(())
}
}

@@ -4,13 +4,12 @@ use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;

@@ -124,25 +123,6 @@ impl PartialEq for Layer {
}
}

pub(crate) fn local_layer_path(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
_generation: &Generation,
) -> Utf8PathBuf {
let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);

timeline_path.join(layer_file_name.file_name())

// TOOD: include generation in the name in now+1 releases.
// timeline_path.join(format!(
// "{}{}",
// layer_file_name.file_name(),
// generation.get_suffix()
// ))
}

impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
@@ -151,14 +131,6 @@ impl Layer {
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> Self {
let local_path = local_layer_path(
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&file_name,
&metadata.generation,
);

let desc = PersistentLayerDesc::from_filename(
timeline.tenant_shard_id,
timeline.timeline_id,
@@ -171,7 +143,6 @@ impl Layer {
let owner = Layer(Arc::new(LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
None,
@@ -188,7 +159,6 @@ impl Layer {
pub(crate) fn for_resident(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> ResidentLayer {
@@ -214,7 +184,6 @@ impl Layer {
LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
@@ -256,19 +225,9 @@ impl Layer {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);

let local_path = local_layer_path(
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&desc.filename(),
&timeline.generation,
);

LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
@@ -451,13 +410,6 @@ impl Layer {
self.0.metadata()
}

pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
self.0
.timeline
.upgrade()
.map(|timeline| timeline.timeline_id)
}

/// Traditional debug dumping facility
#[allow(unused)]
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -757,17 +709,19 @@ impl Drop for LayerInner {
}

impl LayerInner {
#[allow(clippy::too_many_arguments)]
fn new(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
access_stats: LayerAccessStats,
desc: PersistentLayerDesc,
downloaded: Option<Arc<DownloadedLayer>>,
generation: Generation,
shard: ShardIndex,
) -> Self {
let path = conf
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
.join(desc.filename().to_string());

let (inner, version, init_status) = if let Some(inner) = downloaded {
let version = inner.version;
let resident = ResidentOrWantedEvicted::Resident(inner);
@@ -783,7 +737,7 @@ impl LayerInner {
LayerInner {
conf,
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path: local_path,
path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
@@ -1843,23 +1797,25 @@ impl ResidentLayer {
}
}

/// Returns the amount of keys and values written to the writer.
pub(crate) async fn copy_delta_prefix(
/// FIXME: truncate is bad name because we are not truncating anything, but copying the
/// filtered parts.
#[cfg(test)]
pub(super) async fn copy_delta_prefix(
&self,
writer: &mut super::delta_layer::DeltaLayerWriter,
until: Lsn,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
) -> anyhow::Result<()> {
use LayerKind::*;

let owner = &self.owner.0;

match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => d
.copy_prefix(writer, until, ctx)
.copy_prefix(writer, truncate_at, ctx)
.await
.with_context(|| format!("copy_delta_prefix until {until} of {self}")),
Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
.with_context(|| format!("truncate {self}")),
Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")),
}
}

@@ -2,7 +2,6 @@
//! such as compaction and GC

use std::ops::ControlFlow;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};

@@ -10,11 +9,9 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion};
@@ -47,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
Compaction,
Gc,
Eviction,
IngestHouseKeeping,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
@@ -136,30 +132,6 @@ pub fn start_background_loops(
}
},
);

task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::IngestHousekeeping,
Some(tenant_shard_id),
None,
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
ingest_housekeeping_loop(tenant, cancel)
.instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
Ok(())
}
},
);
}

///
@@ -407,61 +379,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}

async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
loop {
tokio::select! {
_ = cancel.cancelled() => {
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
},
}

// We run ingest housekeeping with the same frequency as compaction: it is not worth
// having a distinct setting. But we don't run it in the same task, because compaction
// blocks on acquiring the background job semaphore.
let period = tenant.get_compaction_period();

// If compaction period is set to zero (to disable it), then we will use a reasonable default
let period = if period == Duration::ZERO {
humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
.unwrap()
.into()
} else {
period
};

// Jitter the period by +/- 5%
let period =
rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);

// Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
// a tenant, since it won't have started writing any ephemeral files yet.
if tokio::time::timeout(period, cancel.cancelled())
.await
.is_ok()
{
break;
}

let started_at = Instant::now();
tenant.ingest_housekeeping().await;

warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::IngestHouseKeeping,
);
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
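The ±5% jitter above is plain integer scaling of a `Duration`. A minimal, standalone sketch of the same computation (assuming the `rand` crate; the helper name is illustrative and not part of the pageserver):

```rust
use rand::Rng;
use std::time::Duration;

/// Illustrative helper: spread a fixed period by +/- 5% so that many tenants
/// do not all wake up on the exact same tick.
fn jittered(period: Duration) -> Duration {
    // Duration * u32 and Duration / u32 are defined, so the
    // (period * 95) / 100 .. (period * 105) / 100 range works directly,
    // and Duration implements SampleUniform for gen_range.
    rand::thread_rng().gen_range((period * 95) / 100..(period * 105) / 100)
}

fn main() {
    let period = Duration::from_secs(20);
    println!("sleeping for {:?}", jittered(period));
}
```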

async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
// if the tenant has a proper status already, no need to wait for anything
if tenant.current_state() == TenantState::Active {
@@ -503,6 +420,8 @@ pub(crate) async fn random_init_delay(
period: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
use rand::Rng;

if period == Duration::ZERO {
return Ok(());
}

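For context, a minimal sketch of what a `random_init_delay` helper like the one touched above can look like: skip the sleep entirely for a zero period, otherwise sleep for a random fraction of `period` and return early if the cancellation token fires. The `Cancelled` type and exact shape are assumptions for illustration, not the pageserver's code:

```rust
use rand::Rng;
use std::time::Duration;
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
struct Cancelled;

/// Sketch: sleep for a uniformly random delay in [0, period) unless cancelled.
async fn random_init_delay(
    period: Duration,
    cancel: &CancellationToken,
) -> Result<(), Cancelled> {
    if period == Duration::ZERO {
        return Ok(());
    }
    let delay = rand::thread_rng().gen_range(Duration::ZERO..period);
    tokio::select! {
        _ = cancel.cancelled() => Err(Cancelled),
        _ = tokio::time::sleep(delay) => Ok(()),
    }
}
```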
@@ -1,6 +1,5 @@
mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
mod init;
pub mod layer_manager;
@@ -23,9 +22,8 @@ use pageserver_api::{
},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
models::{
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
TimelineState,
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -60,7 +58,6 @@ use std::{
ops::ControlFlow,
};

use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
@@ -865,13 +862,9 @@ impl Timeline {
// Initialise the reconstruct state for the key with the cache
// entry returned above.
let mut reconstruct_state = ValuesReconstructState::new();

// Only add the cached image to the reconstruct state when it exists.
if cached_page_img.is_some() {
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
}
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));

let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
@@ -1083,7 +1076,7 @@ impl Timeline {
// We should generalize this into Keyspace::contains in the future.
for range in &keyspace.ranges {
if range.start.field1 < METADATA_KEY_BEGIN_PREFIX
|| range.end.field1 > METADATA_KEY_END_PREFIX
|| range.end.field1 >= METADATA_KEY_END_PREFIX
{
return Err(GetVectoredError::Other(anyhow::anyhow!(
"only metadata keyspace can be scanned"
@@ -1501,21 +1494,15 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_and_flush0().await
}

// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}

// Check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
// ephemeral layer bytes has been breached.
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
///
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
/// even if there are no ongoing writes to drive that.
async fn maybe_freeze_ephemeral_layer(&self) {
let Ok(_write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
@@ -1542,11 +1529,13 @@ impl Timeline {
// we are a sharded tenant and have skipped some WAL
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
// Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard
// without any data ingested (yet) doesn't write a remote index as soon as it
// This should be somewhat rare, so we log it at INFO level.
//
// We checked for checkpoint timeout so that a shard without any
// data ingested (yet) doesn't write a remote index as soon as it
// sees its LSN advance: we only do this if we've been layer-less
// for some time.
tracing::debug!(
tracing::info!(
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
disk_consistent_lsn,
last_record_lsn
@@ -1636,6 +1625,11 @@ impl Timeline {
(guard, permit)
};

// Prior to compaction, check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle.
self.maybe_freeze_ephemeral_layer().await;

// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
@@ -1905,7 +1899,7 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerFileName,
layer_file_name: &str,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
@@ -1923,10 +1917,7 @@ impl Timeline {
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
pub(crate) async fn evict_layer(
&self,
layer_file_name: &LayerFileName,
) -> anyhow::Result<Option<bool>> {
pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
.enter()
@@ -2000,12 +1991,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

// Private functions
impl Timeline {
pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
#[allow(dead_code)]
pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.switch_aux_file_policy
.unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy)
.switch_to_aux_file_v2
.unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2)
}

pub(crate) fn get_lazy_slru_download(&self) -> bool {
@@ -2417,8 +2409,8 @@ impl Timeline {

for discovered in discovered {
let (name, kind) = match discovered {
Discovered::Layer(layer_file_name, local_path, file_size) => {
discovered_layers.push((layer_file_name, local_path, file_size));
Discovered::Layer(file_name, file_size) => {
discovered_layers.push((file_name, file_size));
continue;
}
Discovered::Metadata => {
@@ -2463,7 +2455,7 @@ impl Timeline {
let mut needs_cleanup = Vec::new();
let mut total_physical_size = 0;

for (name, local_path, decision) in decided {
for (name, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
// Remote is authoritative, but we may still choose to retain
@@ -2473,23 +2465,26 @@ impl Timeline {
// the correct generation.
UseLocal(remote)
} else {
let local_path = local_path.as_ref().expect("Locally found layer must have path");
init::cleanup_local_file_for_remote(local_path, &local, &remote)?;
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
}
Ok(decision) => decision,
Err(DismissedLayer::Future { local }) => {
if local.is_some() {
let local_path = local_path.expect("Locally found layer must have path");
init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?;
path.push(name.file_name());
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
path.pop();
}
needs_cleanup.push(name);
continue;
}
Err(DismissedLayer::LocalOnly(local)) => {
let local_path = local_path.expect("Locally found layer must have path");
init::cleanup_local_only_file(&local_path, &name, &local)?;
path.push(name.file_name());
init::cleanup_local_only_file(&path, &name, &local)?;
path.pop();
// this file never existed remotely, we will have to do rework
continue;
}
@@ -2505,18 +2500,7 @@ impl Timeline {
let layer = match decision {
UseLocal(m) => {
total_physical_size += m.file_size();

let local_path = local_path.unwrap_or_else(|| {
local_layer_path(
conf,
&this.tenant_shard_id,
&this.timeline_id,
&name,
&m.generation,
)
});

Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard()
Layer::for_resident(conf, &this, name, m).drop_eviction_guard()
}
Evicted(remote) | UseRemote { remote, .. } => {
Layer::for_evicted(conf, &this, name, remote)
@@ -2997,11 +2981,11 @@ impl Timeline {
}
}

async fn find_layer(&self, layer_name: &LayerFileName) -> Option<Layer> {
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
let guard = self.layers.read().await;
for historic_layer in guard.layer_map().iter_historic_layers() {
let historic_layer_name = historic_layer.filename();
if layer_name == &historic_layer_name {
let historic_layer_name = historic_layer.filename().file_name();
if layer_file_name == historic_layer_name {
return Some(guard.get_from_desc(&historic_layer));
}
}
@@ -3031,7 +3015,7 @@ impl Timeline {

HeatMapLayer::new(
layer.layer_desc().filename(),
(&layer.metadata()).into(),
layer.metadata().into(),
last_activity_ts,
)
});
@@ -3533,7 +3517,7 @@ impl Timeline {
Ok(ancestor)
}

pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
@@ -4349,49 +4333,6 @@ impl Timeline {
_ = self.cancel.cancelled() => {}
)
}

/// Detach this timeline from its ancestor by copying all of ancestors layers as this
/// Timelines layers up to the ancestor_lsn.
///
/// Requires a timeline that:
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
/// - has prev_lsn in remote storage (temporary restriction)
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
///
/// During the operation all timelines sharing the data with this timeline will be reparented
/// from our ancestor to be branches of this timeline.
pub(crate) async fn prepare_to_detach_from_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}

/// Completes the ancestor detach. This method is to be called while holding the
/// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
/// timeline be deleted. After this method returns successfully, tenant must be reloaded.
///
/// Pageserver receiving a SIGKILL during this operation is not supported (yet).
pub(crate) async fn complete_detaching_timeline_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
prepared: detach_ancestor::PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
detach_ancestor::complete(self, tenant, prepared, ctx).await
}
}

/// Top-level failure to compact.
@@ -4500,24 +4441,6 @@ impl Timeline {
Ok(())
}

async fn rewrite_layers(
self: &Arc<Self>,
replace_layers: Vec<(Layer, ResidentLayer)>,
drop_layers: Vec<Layer>,
) -> anyhow::Result<()> {
let mut guard = self.layers.write().await;

guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);

let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();

if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
}

Ok(())
}

/// Schedules the uploads of the given image layers
fn upload_new_image_layers(
self: &Arc<Self>,
@@ -4676,8 +4599,6 @@ impl Timeline {
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> anyhow::Result<GcResult> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc

let now = SystemTime::now();
let mut result: GcResult = GcResult::default();

@@ -15,8 +15,7 @@ use anyhow::{anyhow, Context};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -94,7 +93,7 @@ impl Timeline {
// Define partitioning schema if needed

// FIXME: the match should only cover repartitioning, not the next steps
let partition_count = match self
match self
.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
@@ -147,7 +146,6 @@ impl Timeline {
assert!(sparse_layers.is_empty());

self.upload_new_image_layers(dense_layers)?;
dense_partitioning.parts.len()
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -159,150 +157,9 @@ impl Timeline {
if !self.cancel.is_cancelled() {
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
}
1
}
};

if self.shard_identity.count >= ShardCount::new(2) {
// Limit the number of layer rewrites to the number of partitions: this means its
// runtime should be comparable to a full round of image layer creations, rather than
// being potentially much longer.
let rewrite_max = partition_count;

self.compact_shard_ancestors(rewrite_max, ctx).await?;
}

Ok(())
}

/// Check for layers that are elegible to be rewritten:
/// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
/// we don't indefinitely retain keys in this shard that aren't needed.
/// - For future use: layers beyond pitr_interval that are in formats we would
/// rather not maintain compatibility with indefinitely.
///
/// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
/// how much work it will try to do in each compaction pass.
async fn compact_shard_ancestors(
self: &Arc<Self>,
rewrite_max: usize,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut drop_layers = Vec::new();
let layers_to_rewrite: Vec<Layer> = Vec::new();

// We will use the PITR cutoff as a condition for rewriting layers.
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;

let layers = self.layers.read().await;
for layer_desc in layers.layer_map().iter_historic_layers() {
let layer = layers.get_from_desc(&layer_desc);
if layer.metadata().shard.shard_count == self.shard_identity.count {
// This layer does not belong to a historic ancestor, no need to re-image it.
continue;
}

// This layer was created on an ancestor shard: check if it contains any data for this shard.
let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
let layer_local_page_count = sharded_range.page_count();
let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
if layer_local_page_count == 0 {
// This ancestral layer only covers keys that belong to other shards.
// We include the full metadata in the log: if we had some critical bug that caused
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
info!(%layer, old_metadata=?layer.metadata(),
"dropping layer after shard split, contains no keys for this shard.",
);

if cfg!(debug_assertions) {
// Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
// wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
// should be !is_key_disposable()
let range = layer_desc.get_key_range();
let mut key = range.start;
while key < range.end {
debug_assert!(self.shard_identity.is_key_disposable(&key));
key = key.next();
}
}

drop_layers.push(layer);
continue;
} else if layer_local_page_count != u32::MAX
&& layer_local_page_count == layer_raw_page_count
{
debug!(%layer,
"layer is entirely shard local ({} keys), no need to filter it",
layer_local_page_count
);
continue;
}

// Don't bother re-writing a layer unless it will at least halve its size
if layer_local_page_count != u32::MAX
&& layer_local_page_count > layer_raw_page_count / 2
{
debug!(%layer,
"layer is already mostly local ({}/{}), not rewriting",
layer_local_page_count,
layer_raw_page_count
);
}

// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
// without incurring the I/O cost of a rewrite.
if layer_desc.get_lsn_range().end >= pitr_cutoff {
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
layer_desc.get_lsn_range().end, pitr_cutoff);
continue;
}

if layer_desc.is_delta() {
// We do not yet implement rewrite of delta layers
debug!(%layer, "Skipping rewrite of delta layer");
continue;
}

// Only rewrite layers if they would have different remote paths: either they belong to this
// shard but an old generation, or they belonged to another shard. This also implicitly
// guarantees that the layer is persistent in remote storage (as only remote persistent
// layers are carried across shard splits, any local-only layer would be in the current generation)
if layer.metadata().generation == self.generation
&& layer.metadata().shard.shard_count == self.shard_identity.count
{
debug!(%layer, "Skipping rewrite, is not from old generation");
continue;
}

if layers_to_rewrite.len() >= rewrite_max {
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
layers_to_rewrite.len()
);
continue;
}

// Fall through: all our conditions for doing a rewrite passed.
// TODO: implement rewriting
tracing::debug!(%layer, "Would rewrite layer");
}

// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
drop(layers);

// TODO: collect layers to rewrite
let replace_layers = Vec::new();

// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_layers, drop_layers).await?;

if let Some(remote_client) = self.remote_client.as_ref() {
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
remote_client.wait_completion().await?;
}

Ok(())
}

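The skip/drop/rewrite choices in `compact_shard_ancestors` above boil down to arithmetic on page counts and LSNs. A condensed, purely illustrative sketch of that decision, with simplified placeholder types standing in for the real layer descriptors and metadata:

```rust
/// Illustrative stand-ins for the real layer metadata.
struct LayerInfo {
    local_pages: u32, // pages in the layer that belong to the current shard
    raw_pages: u32,   // total pages covered by the layer's key range
    end_lsn: u64,
    is_delta: bool,
    from_current_shard_and_generation: bool,
}

enum Action {
    Drop,    // no keys for this shard: safe to remove after a split
    Keep,    // nothing to gain from rewriting (yet)
    Rewrite, // ancestor image layer carrying mostly foreign keys
}

fn decide(l: &LayerInfo, pitr_cutoff: u64) -> Action {
    if l.local_pages == 0 {
        return Action::Drop;
    }
    // Already (mostly) local, still inside the PITR window, a delta layer,
    // or already written by this shard and generation: skip the rewrite.
    if l.local_pages == l.raw_pages
        || l.local_pages > l.raw_pages / 2
        || l.end_lsn >= pitr_cutoff
        || l.is_delta
        || l.from_current_shard_and_generation
    {
        return Action::Keep;
    }
    Action::Rewrite
}
```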
@@ -422,10 +422,6 @@ impl DeleteTimelineFlow {
pub(crate) fn is_finished(&self) -> bool {
matches!(self, Self::Finished)
}

pub(crate) fn is_not_started(&self) -> bool {
matches!(self, Self::NotStarted)
}
}

struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

@@ -1,550 +0,0 @@
use std::sync::Arc;

use super::{layer_manager::LayerManager, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
tenant::{
storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
Tenant,
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};

#[derive(Debug, thiserror::Error)]
pub(crate) enum Error {
#[error("no ancestors")]
NoAncestor,
#[error("too many ancestors")]
TooManyAncestors,
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("detached timeline must receive writes before the operation")]
DetachedTimelineNeedsWrites,
#[error("flushing failed")]
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]
CopyDeltaPrefix(#[source] anyhow::Error),
#[error("upload rewritten layer")]
UploadRewritten(#[source] anyhow::Error),

#[error("ancestor is already being detached by: {}", .0)]
OtherTimelineDetachOngoing(TimelineId),

#[error("remote copying layer failed")]
CopyFailed(#[source] anyhow::Error),

#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
}

pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}

/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
#[derive(Debug)]
pub(crate) struct Options {
pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
pub(crate) copy_concurrency: std::num::NonZeroUsize,
}

impl Default for Options {
fn default() -> Self {
Self {
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
}
}
}

/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
tenant: &Tenant,
options: Options,
ctx: &RequestContext,
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;

if detached.remote_client.as_ref().is_none() {
unimplemented!("no new code for running without remote storage");
}

let Some((ancestor, ancestor_lsn)) = detached
.ancestor_timeline
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
return Err(NoAncestor);
};

if !ancestor_lsn.is_valid() {
return Err(NoAncestor);
}

if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to
return Err(TooManyAncestors);
}

if detached.get_prev_record_lsn() == Lsn::INVALID
|| detached.disk_consistent_lsn.load() == ancestor_lsn
{
// this is to avoid a problem that after detaching we would be unable to start up the
// compute because of "PREV_LSN: invalid".
return Err(DetachedTimelineNeedsWrites);
}

// before we acquire the gate, we must mark the ancestor as having a detach operation
// ongoing which will block other concurrent detach operations so we don't get to ackward
// situations where there would be two branches trying to reparent earlier branches.
let (guard, barrier) = completion::channel();

{
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(OtherTimelineDetachOngoing(*tl));
}
}
*guard = Some((detached.timeline_id, barrier));
}

let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;

if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
let span =
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
async {
let started_at = std::time::Instant::now();
let freeze_and_flush = ancestor.freeze_and_flush0();
let mut freeze_and_flush = std::pin::pin!(freeze_and_flush);

let res =
tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush)
.await;

let res = match res {
Ok(res) => res,
Err(_elapsed) => {
tracing::info!("freezing and flushing ancestor is still ongoing");
freeze_and_flush.await
}
};

res.map_err(FlushAncestor)?;

// we do not need to wait for uploads to complete but we do need `struct Layer`,
// copying delta prefix is unsupported currently for `InMemoryLayer`.
tracing::info!(
elapsed_ms = started_at.elapsed().as_millis(),
"froze and flushed the ancestor"
);
Ok(())
}
.instrument(span)
.await?;
}

let end_lsn = ancestor_lsn + 1;

let (filtered_layers, straddling_branchpoint, rest_of_historic) = {
// we do not need to start from our layers, because they can only be layers that come
// *after* ancestor_lsn
let layers = tokio::select! {
guard = ancestor.layers.read() => guard,
_ = detached.cancel.cancelled() => {
return Err(ShuttingDown);
}
_ = ancestor.cancel.cancelled() => {
return Err(ShuttingDown);
}
};

// between retries, these can change if compaction or gc ran in between. this will mean
// we have to redo work.
partition_work(ancestor_lsn, &layers)
};

// TODO: layers are already sorted by something: use that to determine how much of remote
// copies are already done.
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");

// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
let mut new_layers: Vec<Layer> =
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());

{
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");

let mut tasks = tokio::task::JoinSet::new();

let mut wrote_any = false;

let limiter = Arc::new(tokio::sync::Semaphore::new(
options.rewrite_concurrency.get(),
));

for layer in straddling_branchpoint {
let limiter = limiter.clone();
let timeline = detached.clone();
let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);

tasks.spawn(async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
Ok(copied)
});
}

while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(Some(copied))) => {
wrote_any = true;
tracing::info!(layer=%copied, "rewrote and uploaded");
new_layers.push(copied);
}
Ok(Ok(None)) => {}
Ok(Err(e)) => return Err(e),
Err(je) => return Err(Unexpected(je.into())),
}
}

// FIXME: the fsync should be mandatory, after both rewrites and copies
if wrote_any {
let timeline_dir = VirtualFile::open(
&detached
.conf
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
}

let mut tasks = tokio::task::JoinSet::new();
let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));

for adopted in rest_of_historic {
let limiter = limiter.clone();
let timeline = detached.clone();

tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let owned =
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
tracing::info!(layer=%owned, "remote copied");
Ok(owned)
}
.in_current_span(),
);
}

while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(owned)) => {
new_layers.push(owned);
}
Ok(Err(failed)) => {
return Err(failed);
}
Err(je) => return Err(Unexpected(je.into())),
}
}

// TODO: fsync directory again if we hardlinked something

let prepared = PreparedTimelineDetach { layers: new_layers };

Ok((guard, prepared))
}

fn partition_work(
ancestor_lsn: Lsn,
source_layermap: &LayerManager,
) -> (usize, Vec<Layer>, Vec<Layer>) {
let mut straddling_branchpoint = vec![];
let mut rest_of_historic = vec![];

let mut later_by_lsn = 0;

for desc in source_layermap.layer_map().iter_historic_layers() {
// off by one chances here:
// - start is inclusive
// - end is exclusive
if desc.lsn_range.start > ancestor_lsn {
later_by_lsn += 1;
continue;
}

let target = if desc.lsn_range.start <= ancestor_lsn
&& desc.lsn_range.end > ancestor_lsn
&& desc.is_delta
{
// TODO: image layer at Lsn optimization
&mut straddling_branchpoint
} else {
&mut rest_of_historic
};

target.push(source_layermap.get_from_desc(&desc));
}

(later_by_lsn, straddling_branchpoint, rest_of_historic)
}
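`partition_work` above is a pure classification over the ancestor's historic layers: anything starting after the branch point is skipped, deltas that straddle it get their LSN prefix copied, and everything else is adopted as-is. A small self-contained sketch of the same bucketing, with simplified placeholder types rather than the real descriptors:

```rust
use std::ops::Range;

struct Desc {
    lsn_range: Range<u64>,
    is_delta: bool,
}

enum Bucket {
    LaterByLsn, // starts after the branch point: not needed by the child
    Straddling, // delta crossing the branch point: copy the prefix up to it
    Adopted,    // fully at or below the branch point: copy/reference unchanged
}

fn classify(desc: &Desc, ancestor_lsn: u64) -> Bucket {
    if desc.lsn_range.start > ancestor_lsn {
        Bucket::LaterByLsn
    } else if desc.lsn_range.end > ancestor_lsn && desc.is_delta {
        Bucket::Straddling
    } else {
        Bucket::Adopted
    }
}
```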

async fn upload_rewritten_layer(
end_lsn: Lsn,
layer: &Layer,
target: &Arc<Timeline>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Option<Layer>, Error> {
use Error::UploadRewritten;
let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?;

let Some(copied) = copied else {
return Ok(None);
};

// FIXME: better shuttingdown error
target
.remote_client
.as_ref()
.unwrap()
.upload_layer_file(&copied, cancel)
.await
.map_err(UploadRewritten)?;

Ok(Some(copied.into()))
}

async fn copy_lsn_prefix(
end_lsn: Lsn,
layer: &Layer,
target_timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};

tracing::debug!(%layer, %end_lsn, "copying lsn prefix");

let mut writer = DeltaLayerWriter::new(
target_timeline.conf,
target_timeline.timeline_id,
target_timeline.tenant_shard_id,
layer.layer_desc().key_range.start,
layer.layer_desc().lsn_range.start..end_lsn,
)
.await
.map_err(CopyDeltaPrefix)?;

let resident = layer
.download_and_keep_resident()
.await
// likely shutdown
.map_err(RewrittenDeltaDownloadFailed)?;

let records = resident
.copy_delta_prefix(&mut writer, end_lsn, ctx)
.await
.map_err(CopyDeltaPrefix)?;

drop(resident);

tracing::debug!(%layer, records, "copied records");

if records == 0 {
drop(writer);
// TODO: we might want to store an empty marker in remote storage for this
// layer so that we will not needlessly walk `layer` on repeated attempts.
Ok(None)
} else {
// reuse the key instead of adding more holes between layers by using the real
// highest key in the layer.
let reused_highest_key = layer.layer_desc().key_range.end;
let copied = writer
.finish(reused_highest_key, target_timeline, ctx)
.await
.map_err(CopyDeltaPrefix)?;

tracing::debug!(%layer, %copied, "new layer produced");

Ok(Some(copied))
}
}

/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote
/// storage on successful return without the adopted layer being added to `index_part.json`.
async fn remote_copy(
adopted: &Layer,
adoptee: &Arc<Timeline>,
generation: Generation,
cancel: &CancellationToken,
) -> Result<Layer, Error> {
use Error::CopyFailed;

// depending if Layer::keep_resident we could hardlink

let mut metadata = adopted.metadata();
debug_assert!(metadata.generation <= generation);
metadata.generation = generation;

let owned = crate::tenant::storage_layer::Layer::for_evicted(
adoptee.conf,
adoptee,
adopted.layer_desc().filename(),
metadata,
);

// FIXME: better shuttingdown error
adoptee
.remote_client
.as_ref()
.unwrap()
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
.map_err(CopyFailed)
}

/// See [`Timeline::complete_detaching_timeline_ancestor`].
pub(super) async fn complete(
detached: &Arc<Timeline>,
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
let rtc = detached
.remote_client
.as_ref()
.expect("has to have a remote timeline client for timeline ancestor detach");

let PreparedTimelineDetach { layers } = prepared;

let ancestor = detached
.get_ancestor_timeline()
.expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn();

// publish the prepared layers before we reparent any of the timelines, so that on restart
// reparented timelines find layers. also do the actual detaching.
//
// if we crash after this operation, we will at least come up having detached a timeline, but
// we cannot go back and reparent the timelines which would had been reparented in normal
// execution.
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;

let mut tasks = tokio::task::JoinSet::new();

// because we are now keeping the slot in progress, it is unlikely that there will be any
// timeline deletions during this time. if we raced one, then we'll just ignore it.
tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
if Arc::ptr_eq(tl, detached) {
return None;
}

if !tl.is_active() {
return None;
}

let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;

let is_deleting = tl
.delete_progress
.try_lock()
.map(|flow| !flow.is_not_started())
.unwrap_or(true);

if is_same && is_earlier && !is_deleting {
Some(tl.clone())
} else {
None
}
})
.for_each(|timeline| {
// important in this scope: we are holding the Tenant::timelines lock
let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
let new_parent = detached.timeline_id;

tasks.spawn(
async move {
let res = timeline
.remote_client
.as_ref()
.expect("reparented has to have remote client because detached has one")
.schedule_reparenting_and_wait(&new_parent)
.await;

match res {
Ok(()) => Some(timeline),
Err(e) => {
// with the use of tenant slot, we no longer expect these.
tracing::warn!("reparenting failed: {e:#}");
None
}
}
}
.instrument(span),
);
});

let reparenting_candidates = tasks.len();
let mut reparented = Vec::with_capacity(tasks.len());

while let Some(res) = tasks.join_next().await {
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push(timeline.timeline_id);
}
Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had
// started deletion, and that is fine.
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// ignore; it's better to continue with a single reparenting failing (or even
// all of them) in order to get to the goal state.
//
// these timelines will never be reparentable, but they can be always detached as
// separate tree roots.
}
Err(je) => tracing::error!("unexpected join error: {je:?}"),
}
}

if reparenting_candidates != reparented.len() {
tracing::info!("failed to reparent some candidates");
}

Ok(reparented)
}
@@ -12,7 +12,7 @@ use crate::{
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8Path;
use pageserver_api::shard::ShardIndex;
use std::{collections::HashMap, str::FromStr};
use utils::lsn::Lsn;
@@ -20,7 +20,7 @@ use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerFileName, Utf8PathBuf, u64),
Layer(LayerFileName, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(String),
/// Old temporary timeline files, unsure what these really are, should be removed
@@ -46,7 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
let discovered = match LayerFileName::from_str(&file_name) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
Discovered::Layer(file_name, file_size)
}
Err(_) => {
if file_name == METADATA_FILE_NAME {
@@ -104,38 +104,26 @@ pub(super) enum DismissedLayer {

/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>,
discovered: Vec<(LayerFileName, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(
LayerFileName,
Option<Utf8PathBuf>,
Result<Decision, DismissedLayer>,
)> {
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
use Decision::*;

// name => (local_path, local_metadata, remote_metadata)
type Collected = HashMap<
LayerFileName,
(
Option<Utf8PathBuf>,
Option<LayerFileMetadata>,
Option<LayerFileMetadata>,
),
>;
// name => (local, remote)
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;

let mut discovered = discovered
.into_iter()
.map(|(layer_name, local_path, file_size)| {
.map(|(name, file_size)| {
(
layer_name,
name,
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(local_path),
Some(LayerFileMetadata::new(file_size, generation, shard)),
None,
),
@@ -152,15 +140,15 @@ pub(super) fn reconcile(
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let Some(existing) = discovered.get_mut(name) {
existing.2 = Some(metadata);
existing.1 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, None, Some(metadata)));
discovered.insert(name.to_owned(), (None, Some(metadata)));
}
});

discovered
.into_iter()
.map(|(name, (local_path, local, remote))| {
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(DismissedLayer::Future { local })
} else {
@@ -177,7 +165,7 @@ pub(super) fn reconcile(
}
};

(name, local_path, decision)
(name, decision)
})
.collect::<Vec<_>>()
}

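The `reconcile` function above is essentially a keyed merge of two partial maps (locally discovered files and the remote index) into per-layer decisions. A stripped-down sketch of that merge shape only, using plain strings and `u64` sizes in place of the real `LayerFileName`/`LayerFileMetadata` types and a simplified decision rule (the real function treats the remote index as authoritative and handles generations):

```rust
use std::collections::HashMap;

#[derive(Debug)]
enum Decision {
    UseLocal(u64),                         // only found on disk, or sizes agree
    UseRemote { local: u64, remote: u64 }, // both exist but disagree
    Evicted(u64),                          // only known to the remote index
}

fn reconcile(local: Vec<(String, u64)>, remote: Vec<(String, u64)>) -> Vec<(String, Decision)> {
    // name => (local_size, remote_size)
    let mut merged: HashMap<String, (Option<u64>, Option<u64>)> = HashMap::new();
    for (name, size) in local {
        merged.entry(name).or_insert((None, None)).0 = Some(size);
    }
    for (name, size) in remote {
        merged.entry(name).or_insert((None, None)).1 = Some(size);
    }
    merged
        .into_iter()
        .map(|(name, pair)| {
            let decision = match pair {
                (Some(l), None) => Decision::UseLocal(l),
                (None, Some(r)) => Decision::Evicted(r),
                (Some(l), Some(r)) if l != r => Decision::UseRemote { local: l, remote: r },
                (Some(l), Some(_)) => Decision::UseLocal(l),
                (None, None) => unreachable!("never inserted"),
            };
            (name, decision)
        })
        .collect()
}
```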
@@ -205,24 +205,6 @@ impl LayerManager {
updates.flush();
}

/// Called when compaction is completed.
pub(crate) fn rewrite_layers(
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
drop_layers: &[Layer],
_metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();

// TODO: implement rewrites (currently this code path only used for drops)
assert!(rewrite_layers.is_empty());

for l in drop_layers {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}

/// Called when garbage collect has selected the layers to be removed.
pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
let mut updates = self.layer_map.batch_update();

@@ -14,8 +14,7 @@ OBJS = \
relsize_cache.o \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
control_plane_connector.o

PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;

int neon_protocol_version = 2;
int neon_protocol_version = 1;

static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,

@@ -34,7 +34,6 @@
#include "walproposer.h"
#include "pagestore_client.h"
#include "control_plane_connector.h"
#include "walsender_hooks.h"

PG_MODULE_MAGIC;
void _PG_init(void);
@@ -266,6 +265,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
}
}


void
_PG_init(void)
{
@@ -279,7 +279,6 @@ _PG_init(void)

pg_init_libpagestore();
pg_init_walproposer();
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

InitLogicalReplicationMonitor();


@@ -36,7 +36,10 @@

static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
static void NeonWALReaderResetRemote(NeonWALReader *state);
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
static void neon_wal_segment_close(NeonWALReader *state);
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
TimeLineID tli);

@@ -79,9 +82,8 @@ struct NeonWALReader
XLogRecPtr req_lsn;
Size req_len;
Size req_progress;
char donor_conninfo[MAXCONNINFO];
WalProposer *wp; /* we learn donor through walproposer */
char donor_name[64]; /* saved donor safekeeper name for logging */
XLogRecPtr donor_lsn;
/* state of connection to safekeeper */
NeonWALReaderRemoteState rem_state;
WalProposerConn *wp_conn;
@@ -105,7 +107,7 @@ struct NeonWALReader

/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix)
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
{
NeonWALReader *reader;

@@ -121,6 +123,8 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
reader->seg.ws_tli = 0;
reader->segcxt.ws_segsize = wal_segment_size;

reader->wp = wp;

reader->rem_state = RS_NONE;

if (log_prefix)
@@ -200,16 +204,21 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
{
if (state->rem_state == RS_NONE)
{
if (!NeonWALReaderUpdateDonor(state))
XLogRecPtr donor_lsn;

/* no connection yet; start one */
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);

if (donor == NULL)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to establish remote connection to fetch WAL: no donor available");
return NEON_WALREAD_ERROR;

}
/* no connection yet; start one */
nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn));
state->wp_conn = libpqwp_connect_start(state->donor_conninfo);
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
state->wp_conn = libpqwp_connect_start(donor->conninfo);
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
{
snprintf(state->err_msg, sizeof(state->err_msg),
@@ -242,22 +251,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
{
/* connection successfully established */
char start_repl_query[128];
term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm);

/*
* Set elected walproposer's term to pull only data from
* its history. Note: for logical walsender it means we
* might stream WAL not yet committed by safekeepers. It
* would be cleaner to fix this.
*
* mineLastElectedTerm shouldn't be 0 at this point
* because we checked above that donor exists and it
* appears only after successfull election.
*/
Assert(term > 0);
snprintf(start_repl_query, sizeof(start_repl_query),
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
LSN_FORMAT_ARGS(startptr), term);
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
state->donor_name, start_repl_query);
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
@@ -407,10 +404,6 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;

/* Update the current segment info. */
state->seg.ws_tli = tli;

return NEON_WALREAD_SUCCESS;
}
}
@@ -533,7 +526,7 @@ err:
}

/* reset remote connection and request in progress */
void
static void
NeonWALReaderResetRemote(NeonWALReader *state)
{
state->req_lsn = InvalidXLogRecPtr;
@@ -698,25 +691,13 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
return true;
}

XLogRecPtr
NeonWALReaderGetRemLsn(NeonWALReader *state)
{
return state->rem_lsn;
}

const WALOpenSegment *
NeonWALReaderGetSegment(NeonWALReader *state)
{
return &state->seg;
}

/*
* Copy of vanilla wal_segment_open, but returns false in case of error instead
* of ERROR, with errno set.
*
* XLogReaderRoutine->segment_open callback for local pg_wal files
*/
bool
static bool
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
TimeLineID *tli_p)
{
@@ -743,7 +724,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
}

/* copy of vanilla wal_segment_close with NeonWALReader */
void
static void
neon_wal_segment_close(NeonWALReader *state)
{
if (state->seg.ws_file >= 0)
@@ -759,19 +740,3 @@ NeonWALReaderErrMsg(NeonWALReader *state)
{
return state->err_msg;
}

/*
* Returns true if there is a donor, and false otherwise
*/
bool
NeonWALReaderUpdateDonor(NeonWALReader *state)
{
WalproposerShmemState *wps = GetWalpropShmemState();

SpinLockAcquire(&wps->mutex);
memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name));
memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo));
state->donor_lsn = wps->donor_lsn;
SpinLockRelease(&wps->mutex);
return state->donor_name[0] != '\0';
}

@@ -19,19 +19,12 @@ typedef enum
NEON_WALREAD_ERROR,
} NeonWALReadResult;

extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix);
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
extern void NeonWALReaderFree(NeonWALReader *state);
extern void NeonWALReaderResetRemote(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state);
extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state);
extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
extern void neon_wal_segment_close(NeonWALReader *state);
extern bool NeonWALReaderUpdateDonor(NeonWALReader *state);


#endif /* __NEON_WALREADER_H__ */

@@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b);
static char *FormatSafekeeperState(Safekeeper *sk);
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);


WalProposer *
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -922,8 +922,7 @@ static void
DetermineEpochStartLsn(WalProposer *wp)
{
TermHistory *dth;
int n_ready = 0;
WalproposerShmemState *walprop_shared;
int n_ready = 0;

wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
@@ -965,18 +964,16 @@ DetermineEpochStartLsn(WalProposer *wp)
if (n_ready < wp->quorum)
{
/*
* This is a rare case that can be triggered if safekeeper has voted
* and disconnected. In this case, its state will not be SS_IDLE and
* its vote cannot be used, because we clean up `voteResponse` in
* `ShutdownConnection`.
* This is a rare case that can be triggered if safekeeper has voted and disconnected.
* In this case, its state will not be SS_IDLE and its vote cannot be used, because
* we clean up `voteResponse` in `ShutdownConnection`.
|
||||
*/
|
||||
wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready);
|
||||
}
|
||||
|
||||
/*
|
||||
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are
|
||||
* bootstrapping and nothing was committed yet. Start streaming then from
|
||||
* the basebackup LSN.
|
||||
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
|
||||
* and nothing was committed yet. Start streaming then from the basebackup LSN.
|
||||
*/
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
|
||||
{
|
||||
@@ -987,12 +984,11 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
}
|
||||
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
}
|
||||
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
|
||||
|
||||
/*
|
||||
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
|
||||
* it should never be zero at this point, if we know timelineStartLsn.
|
||||
*
|
||||
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so it
|
||||
* should never be zero at this point, if we know timelineStartLsn.
|
||||
*
|
||||
* timelineStartLsn can be zero only on the first syncSafekeepers run.
|
||||
*/
|
||||
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
|
||||
@@ -1026,9 +1022,10 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
* since which we are going to write according to the consensus. If not,
|
||||
* we must bail out, as clog and other non rel data is inconsistent.
|
||||
*/
|
||||
walprop_shared = wp->api.get_shmem_state(wp);
|
||||
if (!wp->config->syncSafekeepers)
|
||||
{
|
||||
WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp);
|
||||
|
||||
/*
|
||||
* Basebackup LSN always points to the beginning of the record (not
|
||||
* the page), as StartupXLOG most probably wants it this way.
|
||||
@@ -1043,7 +1040,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
* compute (who could generate WAL) is ok.
|
||||
*/
|
||||
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
|
||||
pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
|
||||
walprop_shared->mineLastElectedTerm)))
|
||||
{
|
||||
/*
|
||||
* Panic to restart PG as we need to retake basebackup.
|
||||
@@ -1057,8 +1054,8 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
|
||||
}
|
||||
}
|
||||
walprop_shared->mineLastElectedTerm = wp->propTerm;
|
||||
}
|
||||
pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1108,13 +1105,9 @@ SendProposerElected(Safekeeper *sk)
|
||||
{
|
||||
/* safekeeper is empty or no common point, start from the beginning */
|
||||
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
|
||||
|
||||
/*
|
||||
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
|
||||
* is created manually (test_s3_wal_replay)
|
||||
*/
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
|
||||
/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
|
||||
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
|
||||
}
|
||||
else
|
||||
@@ -1184,12 +1177,6 @@ StartStreaming(Safekeeper *sk)
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
sk->streamingAt = sk->startStreamingAt;
|
||||
|
||||
/*
|
||||
* Donors can only be in SS_ACTIVE state, so we potentially update the
|
||||
* donor when we switch one to SS_ACTIVE.
|
||||
*/
|
||||
UpdateDonorShmem(sk->wp);
|
||||
|
||||
/* event set will be updated inside SendMessageToNode */
|
||||
SendMessageToNode(sk);
|
||||
}
|
||||
@@ -1581,17 +1568,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
 * none if it doesn't exist. donor_lsn is set to end position of the donor to
 * the best of our knowledge.
 */
static void
UpdateDonorShmem(WalProposer *wp)
Safekeeper *
GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
{
	Safekeeper *donor = NULL;
	int			i;
	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;
	*donor_lsn = InvalidXLogRecPtr;

	if (wp->n_votes < wp->quorum)
	{
		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
		return;
		wp_log(WARNING, "GetDonor called before elections are won");
		return NULL;
	}

	/*
@@ -1602,7 +1589,7 @@ UpdateDonorShmem(WalProposer *wp)
	if (wp->safekeeper[wp->donor].state >= SS_IDLE)
	{
		donor = &wp->safekeeper[wp->donor];
		donor_lsn = wp->propEpochStartLsn;
		*donor_lsn = wp->propEpochStartLsn;
	}

	/*
@@ -1614,19 +1601,13 @@ UpdateDonorShmem(WalProposer *wp)
	{
		Safekeeper *sk = &wp->safekeeper[i];

		if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn)
		if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn)
		{
			donor = sk;
			donor_lsn = sk->appendResponse.flushLsn;
			*donor_lsn = sk->appendResponse.flushLsn;
		}
	}

	if (donor == NULL)
	{
		wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping");
		return;
	}
	wp->api.update_donor(wp, donor, donor_lsn);
	return donor;
}

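The hunk above turns UpdateDonorShmem into GetDonor, which returns the chosen safekeeper instead of publishing it to shared memory. The selection rule it encodes, prefer the active safekeeper with the highest acknowledged flush LSN and fall back to the vote-time donor at the epoch start LSN, can be modelled roughly as below. This is a simplified sketch with made-up Rust types, not the walproposer's actual C structures:

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum SkState { Offline, Idle, Active }

#[derive(Debug)]
struct Safekeeper {
    name: &'static str,
    state: SkState,
    flush_lsn: u64, // last LSN this safekeeper acknowledged as flushed
}

/// Pick the safekeeper to fetch WAL from: the vote-time donor (good up to
/// epoch_start_lsn) if it is at least idle, overridden by any active
/// safekeeper that has flushed more WAL.
fn get_donor<'a>(
    safekeepers: &'a [Safekeeper],
    vote_donor: usize,
    epoch_start_lsn: u64,
) -> Option<(&'a Safekeeper, u64)> {
    let mut donor: Option<&Safekeeper> = None;
    let mut donor_lsn: u64 = 0; // plays the role of InvalidXLogRecPtr

    if safekeepers[vote_donor].state >= SkState::Idle {
        donor = Some(&safekeepers[vote_donor]);
        donor_lsn = epoch_start_lsn;
    }
    for sk in safekeepers {
        if sk.state == SkState::Active && sk.flush_lsn > donor_lsn {
            donor = Some(sk);
            donor_lsn = sk.flush_lsn;
        }
    }
    donor.map(|d| (d, donor_lsn))
}

fn main() {
    let sks = [
        Safekeeper { name: "sk1", state: SkState::Active, flush_lsn: 150 },
        Safekeeper { name: "sk2", state: SkState::Active, flush_lsn: 180 },
        Safekeeper { name: "sk3", state: SkState::Offline, flush_lsn: 200 },
    ];
    // vote_donor = 0, epoch_start_lsn = 100 -> sk2 wins with lsn 180
    println!("{:?}", get_donor(&sks, 0, 100));
}
```

Returning the donor to the caller, rather than writing it into shmem here, lets the walsender-side reader refresh the donor on its own schedule via NeonWALReaderUpdateDonor.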
/*
|
||||
@@ -1636,7 +1617,7 @@ static void
|
||||
HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
XLogRecPtr candidateTruncateLsn;
|
||||
XLogRecPtr newCommitLsn;
|
||||
XLogRecPtr newCommitLsn;
|
||||
|
||||
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
if (newCommitLsn > wp->commitLsn)
|
||||
@@ -1646,7 +1627,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
|
||||
BroadcastAppendRequest(wp);
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown().
|
||||
* The last one will terminate the process if the shutdown is requested
|
||||
* and WAL is committed by the quorum. BroadcastAppendRequest() should be
|
||||
|
||||
@@ -284,19 +284,14 @@ typedef struct PageserverFeedback
|
||||
|
||||
typedef struct WalproposerShmemState
|
||||
{
|
||||
pg_atomic_uint64 propEpochStartLsn;
|
||||
char donor_name[64];
|
||||
char donor_conninfo[MAXCONNINFO];
|
||||
XLogRecPtr donor_lsn;
|
||||
|
||||
slock_t mutex;
|
||||
pg_atomic_uint64 mineLastElectedTerm;
|
||||
term_t mineLastElectedTerm;
|
||||
pg_atomic_uint64 backpressureThrottlingTime;
|
||||
pg_atomic_uint64 currentClusterSize;
|
||||
|
||||
/* last feedback from each shard */
|
||||
PageserverFeedback shard_ps_feedback[MAX_SHARDS];
|
||||
int num_shards;
|
||||
int num_shards;
|
||||
|
||||
/* aggregated feedback with min LSNs across shards */
|
||||
PageserverFeedback min_ps_feedback;
|
||||
@@ -470,9 +465,6 @@ typedef struct walproposer_api
|
||||
/* Get pointer to the latest available WAL. */
|
||||
XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp);
|
||||
|
||||
/* Update current donor info in WalProposer Shmem */
|
||||
void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn);
|
||||
|
||||
/* Get current time. */
|
||||
TimestampTz (*get_current_timestamp) (WalProposer *wp);
|
||||
|
||||
@@ -505,7 +497,7 @@ typedef struct walproposer_api
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*
|
||||
*
|
||||
* Returns PG_ASYNC_READ_FAIL on closed connection.
|
||||
*/
|
||||
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
|
||||
@@ -553,14 +545,13 @@ typedef struct walproposer_api
|
||||
* Returns 0 if timeout is reached, 1 if some event happened. Updates
|
||||
* events mask to indicate events and sets sk to the safekeeper which has
|
||||
* an event.
|
||||
*
|
||||
*
|
||||
* On timeout, events is set to WL_NO_EVENTS. On socket event, events is
|
||||
* set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is
|
||||
* closed, events is set to WL_SOCKET_READABLE.
|
||||
*
|
||||
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the
|
||||
* buffer. It can be returned only if caller asked for this event in the
|
||||
* last *_event_set call.
|
||||
*
|
||||
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer.
|
||||
* It can be returned only if caller asked for this event in the last *_event_set call.
|
||||
*/
|
||||
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
|
||||
|
||||
@@ -580,9 +571,9 @@ typedef struct walproposer_api
|
||||
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Called after every AppendResponse from the safekeeper. Used to
|
||||
* propagate backpressure feedback and to confirm WAL persistence (has
|
||||
* been commited on the quorum of safekeepers).
|
||||
* Called after every AppendResponse from the safekeeper. Used to propagate
|
||||
* backpressure feedback and to confirm WAL persistence (has been commited
|
||||
* on the quorum of safekeepers).
|
||||
*/
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk);
|
||||
|
||||
@@ -725,14 +716,12 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
|
||||
extern void WalProposerPoll(WalProposer *wp);
|
||||
extern void WalProposerFree(WalProposer *wp);
|
||||
|
||||
extern WalproposerShmemState *GetWalpropShmemState();
|
||||
|
||||
/*
|
||||
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
|
||||
* recreate set from scratch, hence the export.
|
||||
*/
|
||||
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
|
||||
extern TimeLineID walprop_pg_get_timeline_id(void);
|
||||
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
|
||||
|
||||
|
||||
#define WPEVENT 1337 /* special log level for walproposer internal
|
||||
|
||||
@@ -85,6 +85,7 @@ static void walprop_pg_init_standalone_sync_safekeepers(void);
|
||||
static void walprop_pg_init_walsender(void);
|
||||
static void walprop_pg_init_bgworker(void);
|
||||
static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
|
||||
static TimeLineID walprop_pg_get_timeline_id(void);
|
||||
static void walprop_pg_load_libpqwalreceiver(void);
|
||||
|
||||
static process_interrupts_callback_t PrevProcessInterruptsCallback;
|
||||
@@ -93,8 +94,6 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type;
|
||||
static shmem_request_hook_type prev_shmem_request_hook = NULL;
|
||||
static void walproposer_shmem_request(void);
|
||||
#endif
|
||||
static void WalproposerShmemInit_SyncSafekeeper(void);
|
||||
|
||||
|
||||
static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
|
||||
static void WalSndLoop(WalProposer *wp);
|
||||
@@ -137,7 +136,6 @@ WalProposerSync(int argc, char *argv[])
|
||||
WalProposer *wp;
|
||||
|
||||
init_walprop_config(true);
|
||||
WalproposerShmemInit_SyncSafekeeper();
|
||||
walprop_pg_init_standalone_sync_safekeepers();
|
||||
walprop_pg_load_libpqwalreceiver();
|
||||
|
||||
@@ -283,8 +281,6 @@ WalproposerShmemInit(void)
|
||||
{
|
||||
memset(walprop_shared, 0, WalproposerShmemSize());
|
||||
SpinLockInit(&walprop_shared->mutex);
|
||||
pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
|
||||
pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
|
||||
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
|
||||
pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
|
||||
}
|
||||
@@ -293,17 +289,6 @@ WalproposerShmemInit(void)
|
||||
return found;
|
||||
}
|
||||
|
||||
static void
|
||||
WalproposerShmemInit_SyncSafekeeper(void)
|
||||
{
|
||||
walprop_shared = palloc(WalproposerShmemSize());
|
||||
memset(walprop_shared, 0, WalproposerShmemSize());
|
||||
SpinLockInit(&walprop_shared->mutex);
|
||||
pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
|
||||
pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
|
||||
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
|
||||
}
|
||||
|
||||
#define BACK_PRESSURE_DELAY 10000L // 0.01 sec
|
||||
|
||||
static bool
|
||||
@@ -414,13 +399,6 @@ nwp_shmem_startup_hook(void)
|
||||
WalproposerShmemInit();
|
||||
}
|
||||
|
||||
WalproposerShmemState *
|
||||
GetWalpropShmemState()
|
||||
{
|
||||
Assert(walprop_shared != NULL);
|
||||
return walprop_shared;
|
||||
}
|
||||
|
||||
static WalproposerShmemState *
|
||||
walprop_pg_get_shmem_state(WalProposer *wp)
|
||||
{
|
||||
@@ -453,15 +431,14 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback)
	for (int i = 0; i < walprop_shared->num_shards; i++)
	{
		PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];

		if (feedback->present)
		{
			if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
				min_feedback.last_received_lsn = feedback->last_received_lsn;

			if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
				min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;

			if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
				min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
		}
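record_pageserver_feedback folds per-shard feedback into a minimum across shards, treating InvalidXLogRecPtr (0) as "no value yet". The same fold, sketched here with plain integers instead of the real PageserverFeedback struct (illustrative only):

```rust
/// Minimum LSN across shards, where 0 means "this shard has not reported yet"
/// (mirroring InvalidXLogRecPtr in the C code above).
fn min_lsn_across_shards(shard_lsns: &[u64]) -> u64 {
    shard_lsns
        .iter()
        .copied()
        .filter(|&lsn| lsn != 0) // skip shards without feedback
        .min()
        .unwrap_or(0)
}

fn main() {
    // Shard 1 has not reported (0); the aggregate must ignore it.
    assert_eq!(min_lsn_across_shards(&[0x2000, 0, 0x1800]), 0x1800);
    assert_eq!(min_lsn_across_shards(&[]), 0);
    println!("ok");
}
```

Taking the minimum per field keeps backpressure and WAL retention decisions safe for the slowest shard.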
@@ -574,7 +551,6 @@ static void
|
||||
walprop_sigusr2(SIGNAL_ARGS)
|
||||
{
|
||||
int save_errno = errno;
|
||||
|
||||
got_SIGUSR2 = true;
|
||||
SetLatch(MyLatch);
|
||||
errno = save_errno;
|
||||
@@ -622,7 +598,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp)
|
||||
return GetCurrentTimestamp();
|
||||
}
|
||||
|
||||
TimeLineID
|
||||
static TimeLineID
|
||||
walprop_pg_get_timeline_id(void)
|
||||
{
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
@@ -641,20 +617,6 @@ walprop_pg_load_libpqwalreceiver(void)
|
||||
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn)
|
||||
{
|
||||
WalproposerShmemState *wps = wp->api.get_shmem_state(wp);
|
||||
char donor_name[64];
|
||||
|
||||
pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port);
|
||||
SpinLockAcquire(&wps->mutex);
|
||||
memcpy(wps->donor_name, donor_name, sizeof(donor_name));
|
||||
memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo));
|
||||
wps->donor_lsn = donor_lsn;
|
||||
SpinLockRelease(&wps->mutex);
|
||||
}
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
@@ -755,6 +717,7 @@ walprop_connect_start(Safekeeper *sk)
|
||||
{
|
||||
Assert(sk->conn == NULL);
|
||||
sk->conn = libpqwp_connect_start(sk->conninfo);
|
||||
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
@@ -1128,7 +1091,7 @@ static void
|
||||
StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
{
|
||||
XLogRecPtr FlushPtr;
|
||||
__attribute__((unused)) TimeLineID currTLI;
|
||||
__attribute__((unused)) TimeLineID currTLI;
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
@@ -1332,13 +1295,116 @@ XLogBroadcastWalProposer(WalProposer *wp)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Used to download WAL before basebackup for logical walsenders from sk, no longer
|
||||
needed because walsender always uses neon_walreader.
|
||||
*/
|
||||
/* Download WAL before basebackup for logical walsenders from sk, if needed */
|
||||
static bool
|
||||
WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
char *err;
|
||||
WalReceiverConn *wrconn;
|
||||
WalRcvStreamOptions options;
|
||||
char conninfo[MAXCONNINFO];
|
||||
TimeLineID timeline;
|
||||
XLogRecPtr startpos;
|
||||
XLogRecPtr endpos;
|
||||
|
||||
startpos = GetLogRepRestartLSN(wp);
|
||||
if (startpos == InvalidXLogRecPtr)
|
||||
return true; /* recovery not needed */
|
||||
endpos = wp->propEpochStartLsn;
|
||||
|
||||
timeline = wp->greetRequest.timeline;
|
||||
|
||||
if (!neon_auth_token)
|
||||
{
|
||||
memcpy(conninfo, sk->conninfo, MAXCONNINFO);
|
||||
}
|
||||
else
|
||||
{
|
||||
int written = 0;
|
||||
|
||||
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
wpg_log(FATAL, "could not append password to the safekeeper connection string");
|
||||
}
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
|
||||
#else
|
||||
wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err);
|
||||
#endif
|
||||
|
||||
if (!wrconn)
|
||||
{
|
||||
ereport(WARNING,
|
||||
(errmsg("could not connect to WAL acceptor %s:%s: %s",
|
||||
sk->host, sk->port,
|
||||
err)));
|
||||
return false;
|
||||
}
|
||||
wpg_log(LOG,
|
||||
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
|
||||
"%d",
|
||||
sk->host, sk->port, (uint32) (startpos >> 32),
|
||||
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
|
||||
|
||||
options.logical = false;
|
||||
options.startpoint = startpos;
|
||||
options.slotname = NULL;
|
||||
options.proto.physical.startpointTLI = timeline;
|
||||
|
||||
if (walrcv_startstreaming(wrconn, &options))
|
||||
{
|
||||
XLogRecPtr rec_start_lsn;
|
||||
XLogRecPtr rec_end_lsn = 0;
|
||||
int len;
|
||||
char *buf;
|
||||
pgsocket wait_fd = PGINVALID_SOCKET;
|
||||
|
||||
while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0)
|
||||
{
|
||||
if (len == 0)
|
||||
{
|
||||
(void) WaitLatchOrSocket(
|
||||
MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd,
|
||||
-1, WAIT_EVENT_WAL_RECEIVER_MAIN);
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert(buf[0] == 'w' || buf[0] == 'k');
|
||||
if (buf[0] == 'k')
|
||||
continue; /* keepalive */
|
||||
memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS],
|
||||
sizeof rec_start_lsn);
|
||||
rec_start_lsn = pg_ntoh64(rec_start_lsn);
|
||||
rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
|
||||
|
||||
/* write WAL to disk */
|
||||
XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
|
||||
|
||||
ereport(DEBUG1,
|
||||
(errmsg("Recover message %X/%X length %d",
|
||||
LSN_FORMAT_ARGS(rec_start_lsn), len)));
|
||||
if (rec_end_lsn >= endpos)
|
||||
break;
|
||||
}
|
||||
}
|
||||
ereport(LOG,
|
||||
(errmsg("end of replication stream at %X/%X: %m",
|
||||
LSN_FORMAT_ARGS(rec_end_lsn))));
|
||||
walrcv_disconnect(wrconn);
|
||||
|
||||
/* failed to receive all WAL till endpos */
|
||||
if (rec_end_lsn < endpos)
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(LOG,
|
||||
(errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X",
|
||||
timeline, (uint32) (startpos >> 32), (uint32) startpos)));
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1479,7 +1545,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
|
||||
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
Assert(!sk->xlogreader);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
|
||||
if (sk->xlogreader == NULL)
|
||||
wpg_log(FATAL, "failed to allocate xlog reader");
|
||||
}
|
||||
@@ -1894,8 +1960,8 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
static void
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
HotStandbyFeedback hsFeedback;
|
||||
bool needToAdvanceSlot = false;
|
||||
HotStandbyFeedback hsFeedback;
|
||||
bool needToAdvanceSlot = false;
|
||||
|
||||
if (wp->config->syncSafekeepers)
|
||||
return;
|
||||
@@ -2029,25 +2095,22 @@ GetLogRepRestartLSN(WalProposer *wp)
|
||||
return lrRestartLsn;
|
||||
}
|
||||
|
||||
void
|
||||
SetNeonCurrentClusterSize(uint64 size)
|
||||
void SetNeonCurrentClusterSize(uint64 size)
|
||||
{
|
||||
pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
|
||||
}
|
||||
|
||||
uint64
|
||||
GetNeonCurrentClusterSize(void)
|
||||
uint64 GetNeonCurrentClusterSize(void)
|
||||
{
|
||||
return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
|
||||
}
|
||||
uint64 GetNeonCurrentClusterSize(void);
|
||||
uint64 GetNeonCurrentClusterSize(void);
|
||||
|
||||
|
||||
static const walproposer_api walprop_pg = {
|
||||
.get_shmem_state = walprop_pg_get_shmem_state,
|
||||
.start_streaming = walprop_pg_start_streaming,
|
||||
.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
|
||||
.update_donor = walprop_pg_update_donor,
|
||||
.get_current_timestamp = walprop_pg_get_current_timestamp,
|
||||
.conn_error_message = walprop_error_message,
|
||||
.conn_status = walprop_status,
|
||||
|
||||
@@ -1,172 +0,0 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* walsender_hooks.c
|
||||
*
|
||||
* Implements XLogReaderRoutine in terms of NeonWALReader. Allows for
|
||||
* fetching WAL from safekeepers, which normal xlogreader can't do.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "walsender_hooks.h"
|
||||
#include "postgres.h"
|
||||
#include "fmgr.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/wait_event.h"
|
||||
#include "utils/guc.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
static NeonWALReader *wal_reader = NULL;
|
||||
extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
|
||||
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
|
||||
|
||||
static XLogRecPtr
|
||||
NeonWALReadWaitForWAL(XLogRecPtr loc)
|
||||
{
|
||||
while (!NeonWALReaderUpdateDonor(wal_reader))
|
||||
{
|
||||
pg_usleep(1000);
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
return WalSndWaitForWal(loc);
|
||||
}
|
||||
|
||||
static int
|
||||
NeonWALPageRead(
|
||||
XLogReaderState *xlogreader,
|
||||
XLogRecPtr targetPagePtr,
|
||||
int reqLen,
|
||||
XLogRecPtr targetRecPtr,
|
||||
char *readBuf)
|
||||
{
|
||||
XLogRecPtr rem_lsn;
|
||||
|
||||
/* Wait for flush pointer to advance past our request */
|
||||
XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen);
|
||||
int count;
|
||||
|
||||
if (flushptr < targetPagePtr + reqLen)
|
||||
return -1;
|
||||
|
||||
/* Read at most XLOG_BLCKSZ bytes */
|
||||
if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
|
||||
count = XLOG_BLCKSZ;
|
||||
else
|
||||
count = flushptr - targetPagePtr;
|
||||
|
||||
/*
|
||||
* Sometimes walsender requests non-monotonic sequences of WAL. If that's
|
||||
* the case, we have to reset streaming from remote at the correct
|
||||
* position. For example, walsender may try to verify the segment header
|
||||
* when trying to read in the middle of it.
|
||||
*/
|
||||
rem_lsn = NeonWALReaderGetRemLsn(wal_reader);
|
||||
if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn)
|
||||
{
|
||||
NeonWALReaderResetRemote(wal_reader);
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
NeonWALReadResult res = NeonWALRead(
|
||||
wal_reader,
|
||||
readBuf,
|
||||
targetPagePtr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
{
|
||||
/*
|
||||
* Setting ws_tli is required by the XLogReaderRoutine, it is used
|
||||
* for segment name generation in error reports.
|
||||
*
|
||||
* ReadPageInternal updates ws_segno after calling cb on its own
|
||||
* and XLogReaderRoutine description doesn't require it, but
|
||||
* WALRead sets, let's follow it.
|
||||
*/
|
||||
xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli;
|
||||
xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno;
|
||||
|
||||
/*
|
||||
* ws_file doesn't exist in case of remote read, and isn't used by
|
||||
* xlogreader except by WALRead on which we don't rely anyway.
|
||||
*/
|
||||
return count;
|
||||
}
|
||||
if (res == NEON_WALREAD_ERROR)
|
||||
{
|
||||
elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s",
|
||||
LSN_FORMAT_ARGS(targetPagePtr),
|
||||
reqLen,
|
||||
NeonWALReaderErrMsg(wal_reader));
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Res is WOULDBLOCK, so we wait on the socket, recreating event set
|
||||
* if necessary
|
||||
*/
|
||||
{
|
||||
|
||||
pgsocket sock = NeonWALReaderSocket(wal_reader);
|
||||
uint32_t reader_events = NeonWALReaderEvents(wal_reader);
|
||||
long timeout_ms = 1000;
|
||||
|
||||
ResetLatch(MyLatch);
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
if (ConfigReloadPending)
|
||||
{
|
||||
ConfigReloadPending = false;
|
||||
ProcessConfigFile(PGC_SIGHUP);
|
||||
}
|
||||
|
||||
WaitLatchOrSocket(
|
||||
MyLatch,
|
||||
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events,
|
||||
sock,
|
||||
timeout_ms,
|
||||
WAIT_EVENT_WAL_SENDER_MAIN);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p)
|
||||
{
|
||||
neon_wal_segment_open(wal_reader, nextSegNo, tli_p);
|
||||
xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file;
|
||||
}
|
||||
|
||||
static void
|
||||
NeonWALReadSegmentClose(XLogReaderState *xlogreader)
|
||||
{
|
||||
neon_wal_segment_close(wal_reader);
|
||||
xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file;
|
||||
}
|
||||
|
||||
void
|
||||
NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
|
||||
{
|
||||
if (!wal_reader)
|
||||
{
|
||||
XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
|
||||
|
||||
if (epochStartLsn == 0)
|
||||
{
|
||||
elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!");
|
||||
}
|
||||
wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] ");
|
||||
}
|
||||
xlr->page_read = NeonWALPageRead;
|
||||
xlr->segment_open = NeonWALReadSegmentOpen;
|
||||
xlr->segment_close = NeonWALReadSegmentClose;
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
#ifndef __WALSENDER_HOOKS_H__
|
||||
#define __WALSENDER_HOOKS_H__
|
||||
|
||||
struct XLogReaderRoutine;
|
||||
void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr);
|
||||
|
||||
#endif
|
||||
21
poetry.lock
generated
@@ -1001,17 +1001,18 @@ dotenv = ["python-dotenv"]
|
||||
|
||||
[[package]]
|
||||
name = "flask-cors"
|
||||
version = "4.0.1"
|
||||
version = "3.0.10"
|
||||
description = "A Flask extension adding a decorator for CORS support"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"},
|
||||
{file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"},
|
||||
{file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"},
|
||||
{file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Flask = ">=0.9"
|
||||
Six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "frozenlist"
|
||||
@@ -1242,13 +1243,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "jinja2"
|
||||
version = "3.1.4"
|
||||
version = "3.1.3"
|
||||
description = "A very fast and expressive template engine."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
|
||||
{file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
|
||||
{file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"},
|
||||
{file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2611,13 +2612,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.3"
|
||||
version = "3.0.1"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
|
||||
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
|
||||
{file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
|
||||
{file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2899,4 +2900,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb"
|
||||
content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572"
|
||||
|
||||
@@ -40,7 +40,6 @@ hyper.workspace = true
|
||||
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
|
||||
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
|
||||
http-body-util = { version = "0.1" }
|
||||
indexmap.workspace = true
|
||||
ipnet.workspace = true
|
||||
itertools.workspace = true
|
||||
lasso = { workspace = true, features = ["multi-threaded"] }
|
||||
|
||||
@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
use proxy::redis::notifications;
|
||||
use proxy::serverless::cancel_set::CancelSet;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
|
||||
@@ -244,12 +243,6 @@ struct SqlOverHttpArgs {
|
||||
/// increase memory used by the pool
|
||||
#[clap(long, default_value_t = 128)]
|
||||
sql_over_http_pool_shards: usize,
|
||||
|
||||
#[clap(long, default_value_t = 10000)]
|
||||
sql_over_http_client_conn_threshold: u64,
|
||||
|
||||
#[clap(long, default_value_t = 64)]
|
||||
sql_over_http_cancel_set_shards: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -349,7 +342,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_publisher = match &redis_notifications_client {
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
args.region.clone(),
|
||||
@@ -606,8 +599,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
|
||||
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
|
||||
},
|
||||
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
|
||||
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::{
|
||||
auth::parse_endpoint_param,
|
||||
cancellation::CancelClosure,
|
||||
console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError},
|
||||
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
|
||||
context::RequestMonitoring,
|
||||
error::{ReportableError, UserFacingError},
|
||||
metrics::{Metrics, NumDbConnectionsGuard},
|
||||
@@ -34,9 +34,6 @@ pub enum ConnectionError {
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
WakeComputeError(#[from] WakeComputeError),
|
||||
|
||||
#[error("error acquiring resource permit: {0}")]
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
}
|
||||
|
||||
impl UserFacingError for ConnectionError {
|
||||
@@ -60,9 +57,6 @@ impl UserFacingError for ConnectionError {
|
||||
None => err.to_string(),
|
||||
},
|
||||
WakeComputeError(err) => err.to_string_client(),
|
||||
TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
_ => COULD_NOT_CONNECT.to_owned(),
|
||||
}
|
||||
}
|
||||
@@ -78,7 +72,6 @@ impl ReportableError for ConnectionError {
|
||||
ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
|
||||
ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
console::locks::ApiLocks,
|
||||
rate_limiter::RateBucketInfo,
|
||||
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
serverless::GlobalConnPoolOptions,
|
||||
Host,
|
||||
};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
@@ -56,8 +56,6 @@ pub struct TlsConfig {
|
||||
pub struct HttpConfig {
|
||||
pub request_timeout: tokio::time::Duration,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
pub cancel_set: CancelSet,
|
||||
pub client_conn_threshold: u64,
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
@@ -538,9 +536,9 @@ pub struct RetryConfig {
 impl RetryConfig {
     /// Default options for RetryConfig.

     /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s.
     /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
     pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
         "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2";
         "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
     /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
     /// Cplane has timeout of 60s on each request. 8m7s in total.
     pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
@@ -594,7 +592,7 @@ impl ConcurrencyLockOptions {
     pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
     /// Default options for [`crate::console::provider::ApiLocks`].
     pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
         "shards=64,permits=10,epoch=10m,timeout=10ms";
         "shards=64,permits=50,epoch=10m,timeout=500ms";

     // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";

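The doc comments in the retry-config hunk quote total delays of roughly 6s and 7s for the old and new connect-to-compute defaults. A quick way to sanity-check those figures is to sum the exponential backoff series; the standalone sketch below does just that, assuming the wait grows by the exponent on each successive attempt (it is a back-of-the-envelope check, not proxy code):

```rust
use std::time::Duration;

/// Total sleep time for `num_retries` attempts with exponential backoff:
/// base * exponent^0 + base * exponent^1 + ... + base * exponent^(num_retries - 1)
fn total_backoff(num_retries: u32, base: Duration, exponent: f64) -> Duration {
    (0..num_retries)
        .map(|i| base.mul_f64(exponent.powi(i as i32)))
        .sum()
}

fn main() {
    // Old default: num_retries=5, base=200ms, exponent=2   -> 6.2s ("about 6s")
    println!("{:?}", total_backoff(5, Duration::from_millis(200), 2.0));
    // New default: num_retries=8, base=100ms, exponent=1.6 -> ~7.0s ("about 7s")
    println!("{:?}", total_backoff(8, Duration::from_millis(100), 1.6));
}
```

The new default trades a longer tail (8 attempts) for a gentler curve, so early retries come faster while the total budget stays close to the wake-compute timeout.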
@@ -12,7 +12,6 @@ use crate::{
|
||||
compute,
|
||||
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
|
||||
context::RequestMonitoring,
|
||||
error::ReportableError,
|
||||
intern::ProjectIdInt,
|
||||
metrics::ApiLockMetrics,
|
||||
scram, EndpointCacheKey,
|
||||
@@ -31,8 +30,6 @@ pub mod errors {
|
||||
};
|
||||
use thiserror::Error;
|
||||
|
||||
use super::ApiLockError;
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
const REQUEST_FAILED: &str = "Console request failed";
|
||||
|
||||
@@ -79,7 +76,7 @@ pub mod errors {
|
||||
}
|
||||
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
},
|
||||
@@ -214,8 +211,8 @@ pub mod errors {
|
||||
#[error("Too many connections attempts")]
|
||||
TooManyConnections,
|
||||
|
||||
#[error("error acquiring resource permit: {0}")]
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
#[error("Timeout waiting to acquire wake compute lock")]
|
||||
TimeoutError,
|
||||
}
|
||||
|
||||
// This allows more useful interactions than `#[from]`.
|
||||
@@ -225,6 +222,17 @@ pub mod errors {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<tokio::sync::AcquireError> for WakeComputeError {
|
||||
fn from(_: tokio::sync::AcquireError) -> Self {
|
||||
WakeComputeError::TimeoutError
|
||||
}
|
||||
}
|
||||
impl From<tokio::time::error::Elapsed> for WakeComputeError {
|
||||
fn from(_: tokio::time::error::Elapsed) -> Self {
|
||||
WakeComputeError::TimeoutError
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for WakeComputeError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use WakeComputeError::*;
|
||||
@@ -237,9 +245,7 @@ pub mod errors {
|
||||
|
||||
TooManyConnections => self.to_string(),
|
||||
|
||||
TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -250,7 +256,7 @@ pub mod errors {
|
||||
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
|
||||
WakeComputeError::ApiError(e) => e.get_error_kind(),
|
||||
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -450,23 +456,6 @@ pub struct ApiLocks<K> {
|
||||
metrics: &'static ApiLockMetrics,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ApiLockError {
|
||||
#[error("lock was closed")]
|
||||
AcquireError(#[from] tokio::sync::AcquireError),
|
||||
#[error("permit could not be acquired")]
|
||||
TimeoutError(#[from] tokio::time::error::Elapsed),
|
||||
}
|
||||
|
||||
impl ReportableError for ApiLockError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service,
|
||||
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
pub fn new(
|
||||
name: &'static str,
|
||||
@@ -486,7 +475,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, errors::WakeComputeError> {
|
||||
if self.permits == 0 {
|
||||
return Ok(WakeComputePermit { permit: None });
|
||||
}
|
||||
|
||||
@@ -86,8 +86,6 @@ impl ShouldRetry for compute::ConnectionError {
|
||||
match self {
|
||||
compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
|
||||
compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
|
||||
// the cache entry was not checked for validity
|
||||
compute::ConnectionError::TooManyConnectionAttempts(_) => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
WakeupFailureKind::ApiConsoleOtherError
|
||||
}
|
||||
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
|
||||
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
|
||||
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
|
||||
};
|
||||
Metrics::get()
|
||||
.proxy
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//! Handles both SQL over HTTP and SQL over Websockets.
|
||||
|
||||
mod backend;
|
||||
pub mod cancel_set;
|
||||
mod conn_pool;
|
||||
mod http_util;
|
||||
mod json;
|
||||
@@ -110,37 +109,20 @@ pub async fn task_main(
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let http_conn_span = tracing::info_span!("http_conn", ?conn_id);
|
||||
|
||||
let n_connections = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.sample(crate::metrics::Protocol::Http);
|
||||
tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check");
|
||||
if n_connections > config.http_config.client_conn_threshold {
|
||||
tracing::trace!("attempting to cancel a random connection");
|
||||
if let Some(token) = config.http_config.cancel_set.take() {
|
||||
tracing::debug!("cancelling a random connection");
|
||||
token.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
let conn_token = cancellation_token.child_token();
|
||||
let conn = connection_handler(
|
||||
config,
|
||||
backend.clone(),
|
||||
connections.clone(),
|
||||
cancellation_handler.clone(),
|
||||
conn_token.clone(),
|
||||
server.clone(),
|
||||
tls_acceptor.clone(),
|
||||
conn,
|
||||
peer_addr,
|
||||
)
|
||||
.instrument(http_conn_span);
|
||||
|
||||
connections.spawn(async move {
|
||||
let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token);
|
||||
conn.await
|
||||
});
|
||||
connections.spawn(
|
||||
connection_handler(
|
||||
config,
|
||||
backend.clone(),
|
||||
connections.clone(),
|
||||
cancellation_handler.clone(),
|
||||
cancellation_token.clone(),
|
||||
server.clone(),
|
||||
tls_acceptor.clone(),
|
||||
conn,
|
||||
peer_addr,
|
||||
)
|
||||
.instrument(http_conn_span),
|
||||
);
|
||||
}
|
||||
|
||||
connections.wait().await;
|
||||
@@ -261,7 +243,6 @@ async fn connection_handler(
|
||||
// On cancellation, trigger the HTTP connection handler to shut down.
|
||||
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
|
||||
Either::Left((_cancelled, mut conn)) => {
|
||||
tracing::debug!(%peer_addr, "cancelling connection");
|
||||
conn.as_mut().graceful_shutdown();
|
||||
conn.await
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@ use crate::{
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
locks::ApiLocks,
|
||||
provider::ApiLockError,
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
@@ -132,8 +131,6 @@ pub enum HttpConnError {
|
||||
AuthError(#[from] AuthError),
|
||||
#[error("wake_compute returned error")]
|
||||
WakeCompute(#[from] WakeComputeError),
|
||||
#[error("error acquiring resource permit: {0}")]
|
||||
TooManyConnectionAttempts(#[from] ApiLockError),
|
||||
}
|
||||
|
||||
impl ReportableError for HttpConnError {
|
||||
@@ -144,7 +141,6 @@ impl ReportableError for HttpConnError {
|
||||
HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
|
||||
HttpConnError::AuthError(a) => a.get_error_kind(),
|
||||
HttpConnError::WakeCompute(w) => w.get_error_kind(),
|
||||
HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -157,9 +153,6 @@ impl UserFacingError for HttpConnError {
|
||||
HttpConnError::GetAuthInfo(c) => c.to_string_client(),
|
||||
HttpConnError::AuthError(c) => c.to_string_client(),
|
||||
HttpConnError::WakeCompute(c) => c.to_string_client(),
|
||||
HttpConnError::TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -172,15 +165,6 @@ impl ShouldRetry for HttpConnError {
|
||||
HttpConnError::GetAuthInfo(_) => false,
|
||||
HttpConnError::AuthError(_) => false,
|
||||
HttpConnError::WakeCompute(_) => false,
|
||||
HttpConnError::TooManyConnectionAttempts(_) => false,
|
||||
}
|
||||
}
|
||||
fn should_retry_database_address(&self) -> bool {
|
||||
match self {
|
||||
HttpConnError::ConnectionError(e) => e.should_retry_database_address(),
|
||||
// we never checked cache validity
|
||||
HttpConnError::TooManyConnectionAttempts(_) => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
//! A set for cancelling random http connections
|
||||
|
||||
use std::{
|
||||
hash::{BuildHasher, BuildHasherDefault},
|
||||
num::NonZeroUsize,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use indexmap::IndexMap;
|
||||
use parking_lot::Mutex;
|
||||
use rand::{thread_rng, Rng};
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use uuid::Uuid;
|
||||
|
||||
type Hasher = BuildHasherDefault<FxHasher>;
|
||||
|
||||
pub struct CancelSet {
|
||||
shards: Box<[Mutex<CancelShard>]>,
|
||||
// keyed by random uuid, fxhasher is fine
|
||||
hasher: Hasher,
|
||||
}
|
||||
|
||||
pub struct CancelShard {
|
||||
tokens: IndexMap<uuid::Uuid, (Instant, CancellationToken), Hasher>,
|
||||
}
|
||||
|
||||
impl CancelSet {
|
||||
pub fn new(shards: usize) -> Self {
|
||||
CancelSet {
|
||||
shards: (0..shards)
|
||||
.map(|_| {
|
||||
Mutex::new(CancelShard {
|
||||
tokens: IndexMap::with_hasher(Hasher::default()),
|
||||
})
|
||||
})
|
||||
.collect(),
|
||||
hasher: Hasher::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn take(&self) -> Option<CancellationToken> {
|
||||
for _ in 0..4 {
|
||||
if let Some(token) = self.take_raw(thread_rng().gen()) {
|
||||
return Some(token);
|
||||
}
|
||||
tracing::trace!("failed to get cancel token");
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn take_raw(&self, rng: usize) -> Option<CancellationToken> {
|
||||
NonZeroUsize::new(self.shards.len())
|
||||
.and_then(|len| self.shards[rng % len].lock().take(rng / len))
|
||||
}
|
||||
|
||||
pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> {
|
||||
let shard = NonZeroUsize::new(self.shards.len()).map(|len| {
|
||||
let hash = self.hasher.hash_one(id) as usize;
|
||||
let shard = &self.shards[hash % len];
|
||||
shard.lock().insert(id, token);
|
||||
shard
|
||||
});
|
||||
CancelGuard { shard, id }
|
||||
}
|
||||
}
|
||||
|
||||
impl CancelShard {
|
||||
fn take(&mut self, rng: usize) -> Option<CancellationToken> {
|
||||
NonZeroUsize::new(self.tokens.len()).and_then(|len| {
|
||||
// 10 second grace period so we don't cancel new connections
|
||||
if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?;
|
||||
Some(token)
|
||||
})
|
||||
}
|
||||
|
||||
fn remove(&mut self, id: uuid::Uuid) {
|
||||
self.tokens.swap_remove(&id);
|
||||
}
|
||||
|
||||
fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) {
|
||||
self.tokens.insert(id, (Instant::now(), token));
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CancelGuard<'a> {
|
||||
shard: Option<&'a Mutex<CancelShard>>,
|
||||
id: Uuid,
|
||||
}
|
||||
|
||||
impl Drop for CancelGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some(shard) = self.shard {
|
||||
shard.lock().remove(self.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
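The cancel_set module listed above keeps one CancellationToken per HTTP connection in sharded IndexMaps, so that when the client connection count crosses a threshold the proxy can cancel a random long-lived connection. A rough single-shard sketch of the same idea follows; it is a toy (no sharding, no 10-second grace period) and assumes the rand, uuid (with the v4 feature), and tokio-util crates are available:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

use rand::{seq::IteratorRandom, thread_rng};
use tokio_util::sync::CancellationToken;
use uuid::Uuid;

/// Toy, unsharded version of the proxy's CancelSet: register a token per
/// connection and let an overload handler cancel one picked at random.
#[derive(Default)]
struct ToyCancelSet {
    tokens: Mutex<HashMap<Uuid, CancellationToken>>,
}

impl ToyCancelSet {
    fn insert(&self, id: Uuid, token: CancellationToken) {
        self.tokens.lock().unwrap().insert(id, token);
    }

    fn remove(&self, id: Uuid) {
        self.tokens.lock().unwrap().remove(&id);
    }

    /// Cancel one random registered connection, if any.
    fn cancel_random(&self) -> bool {
        let guard = self.tokens.lock().unwrap();
        if let Some(token) = guard.values().choose(&mut thread_rng()) {
            token.cancel();
            true
        } else {
            false
        }
    }
}

fn main() {
    let set = ToyCancelSet::default();
    let id = Uuid::new_v4();
    let token = CancellationToken::new();
    set.insert(id, token.clone());

    assert!(set.cancel_random());
    assert!(token.is_cancelled());
    set.remove(id); // the real code does this from a drop guard when the connection ends
}
```

The real CancelSet shards the map to cut lock contention and skips tokens younger than ten seconds so brand-new connections are not the ones sacrificed.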
@@ -716,7 +716,7 @@ impl<C: ClientInnerExt> Drop for Client<C> {
|
||||
mod tests {
|
||||
use std::{mem, sync::atomic::AtomicBool};
|
||||
|
||||
use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId};
|
||||
use crate::{BranchId, EndpointId, ProjectId};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -767,8 +767,6 @@ mod tests {
|
||||
max_total_conns: 3,
|
||||
},
|
||||
request_timeout: Duration::from_secs(1),
|
||||
cancel_set: CancelSet::new(0),
|
||||
client_conn_threshold: u64::MAX,
|
||||
}));
|
||||
let pool = GlobalConnPool::new(config);
|
||||
let conn_info = ConnInfo {
|
||||
|
||||
@@ -424,8 +424,8 @@ pub enum SqlOverHttpCancel {
|
||||
impl ReportableError for SqlOverHttpCancel {
|
||||
fn get_error_kind(&self) -> ErrorKind {
|
||||
match self {
|
||||
SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect,
|
||||
SqlOverHttpCancel::Connect => ErrorKind::ClientDisconnect,
|
||||
SqlOverHttpCancel::Postgres => ErrorKind::RateLimit,
|
||||
SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ requests = "^2.31.0"
|
||||
pytest-xdist = "^3.3.1"
|
||||
asyncpg = "^0.29.0"
|
||||
aiopg = "^1.4.0"
|
||||
Jinja2 = "^3.1.4"
|
||||
Jinja2 = "^3.1.3"
|
||||
types-requests = "^2.31.0.0"
|
||||
types-psycopg2 = "^2.9.21.10"
|
||||
boto3 = "^1.34.11"
|
||||
@@ -24,7 +24,7 @@ backoff = "^2.2.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^3.0.3"
|
||||
Werkzeug = "^3.0.1"
|
||||
pytest-order = "^1.1.0"
|
||||
allure-pytest = "^2.13.2"
|
||||
pytest-asyncio = "^0.21.0"
|
||||
|
||||
@@ -725,18 +725,6 @@ where
|
||||
self.state.inmem.commit_lsn
|
||||
);
|
||||
|
||||
// Before first WAL write initialize its segment. It makes first segment
|
||||
// pg_waldump'able because stream from compute doesn't include its
|
||||
// segment and page headers.
|
||||
//
|
||||
// If we fail before first WAL write flush this action would be
|
||||
// repeated, that's ok because it is idempotent.
|
||||
if self.wal_store.flush_lsn() == Lsn::INVALID {
|
||||
self.wal_store
|
||||
.initialize_first_segment(msg.start_streaming_at)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
|
||||
// intersection of our history and history from msg
|
||||
|
||||
@@ -1019,10 +1007,6 @@ mod tests {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
self.lsn = startpos + buf.len() as u64;
|
||||
Ok(())
|
||||
|
||||
@@ -506,8 +506,6 @@ struct WalSender<'a, IO> {
|
||||
send_buf: [u8; MAX_SEND_SIZE],
|
||||
}
|
||||
|
||||
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
/// Send WAL until
|
||||
/// - an error occurs
|
||||
@@ -586,22 +584,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
loop {
|
||||
self.end_pos = self.end_watch.get();
|
||||
let have_something_to_send = (|| {
|
||||
fail::fail_point!(
|
||||
"sk-pause-send",
|
||||
self.appname.as_deref() != Some("pageserver"),
|
||||
|_| { false }
|
||||
);
|
||||
self.end_pos > self.start_pos
|
||||
})();
|
||||
|
||||
if have_something_to_send {
|
||||
if self.end_pos > self.start_pos {
|
||||
// We have something to send.
|
||||
trace!("got end_pos {:?}, streaming", self.end_pos);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Wait for WAL to appear, now self.end_pos == self.start_pos.
|
||||
if let Some(lsn) = self.wait_for_lsn().await? {
|
||||
if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? {
|
||||
self.end_pos = lsn;
|
||||
trace!("got end_pos {:?}, streaming", self.end_pos);
|
||||
return Ok(());
|
||||
@@ -638,54 +628,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait until we have available WAL > start_pos or timeout expires. Returns
|
||||
/// - Ok(Some(end_pos)) if needed lsn is successfully observed;
|
||||
/// - Ok(None) if timeout expired;
|
||||
/// - Err in case of error -- only if 1) term changed while fetching in recovery
|
||||
/// mode 2) watch channel closed, which must never happen.
|
||||
async fn wait_for_lsn(&mut self) -> anyhow::Result<Option<Lsn>> {
|
||||
let fp = (|| {
|
||||
fail::fail_point!(
|
||||
"sk-pause-send",
|
||||
self.appname.as_deref() != Some("pageserver"),
|
||||
|_| { true }
|
||||
);
|
||||
false
|
||||
})();
|
||||
if fp {
|
||||
tokio::time::sleep(POLL_STATE_TIMEOUT).await;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let res = timeout(POLL_STATE_TIMEOUT, async move {
|
||||
loop {
|
||||
let end_pos = self.end_watch.get();
|
||||
if end_pos > self.start_pos {
|
||||
return Ok(end_pos);
|
||||
}
|
||||
if let EndWatch::Flush(rx) = &self.end_watch {
|
||||
let curr_term = rx.borrow().term;
|
||||
if let Some(client_term) = self.term {
|
||||
if curr_term != client_term {
|
||||
bail!("term changed: requested {}, now {}", client_term, curr_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.end_watch.changed().await?;
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
match res {
|
||||
// success
|
||||
Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)),
|
||||
// error inside closure
|
||||
Ok(Err(err)) => Err(err),
|
||||
// timeout
|
||||
Err(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A half driving receiving replies.
|
||||
@@ -743,6 +685,47 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Wait until we have available WAL > start_pos or timeout expires. Returns
|
||||
/// - Ok(Some(end_pos)) if needed lsn is successfully observed;
|
||||
/// - Ok(None) if timeout expired;
|
||||
/// - Err in case of error -- only if 1) term changed while fetching in recovery
|
||||
/// mode 2) watch channel closed, which must never happen.
|
||||
async fn wait_for_lsn(
|
||||
rx: &mut EndWatch,
|
||||
client_term: Option<Term>,
|
||||
start_pos: Lsn,
|
||||
) -> anyhow::Result<Option<Lsn>> {
|
||||
let res = timeout(POLL_STATE_TIMEOUT, async move {
|
||||
loop {
|
||||
let end_pos = rx.get();
|
||||
if end_pos > start_pos {
|
||||
return Ok(end_pos);
|
||||
}
|
||||
if let EndWatch::Flush(rx) = rx {
|
||||
let curr_term = rx.borrow().term;
|
||||
if let Some(client_term) = client_term {
|
||||
if curr_term != client_term {
|
||||
bail!("term changed: requested {}, now {}", client_term, curr_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
rx.changed().await?;
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
match res {
|
||||
// success
|
||||
Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)),
|
||||
// error inside closure
|
||||
Ok(Err(err)) => Err(err),
|
||||
// timeout
|
||||
Err(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -38,12 +38,6 @@ pub trait Storage {
/// LSN of last durably stored WAL record.
fn flush_lsn(&self) -> Lsn;

/// Initialize segment by creating proper long header at the beginning of
/// the segment and short header at the page of given LSN. This is only used
/// for timeline initialization because compute will stream data only since
/// init_lsn. Other segment headers are included in compute stream.
async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>;

/// Write piece of WAL from buf to disk, but not necessarily sync it.
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;

@@ -84,8 +78,6 @@ pub struct PhysicalStorage {
|
||||
|
||||
/// Size of WAL segment in bytes.
|
||||
wal_seg_size: usize,
|
||||
pg_version: u32,
|
||||
system_id: u64,
|
||||
|
||||
/// Written to disk, but possibly still in the cache and not fully persisted.
|
||||
/// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
|
||||
@@ -177,8 +169,6 @@ impl PhysicalStorage {
|
||||
timeline_dir,
|
||||
conf: conf.clone(),
|
||||
wal_seg_size,
|
||||
pg_version: state.server.pg_version,
|
||||
system_id: state.server.system_id,
|
||||
write_lsn,
|
||||
write_record_lsn: write_lsn,
|
||||
flush_record_lsn: flush_lsn,
|
||||
@@ -334,20 +324,6 @@ impl Storage for PhysicalStorage {
|
||||
self.flush_record_lsn
|
||||
}
|
||||
|
||||
async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> {
|
||||
let segno = init_lsn.segment_number(self.wal_seg_size);
|
||||
let (mut file, _) = self.open_or_create(segno).await?;
|
||||
let major_pg_version = self.pg_version / 10000;
|
||||
let wal_seg =
|
||||
postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&wal_seg).await?;
|
||||
file.flush().await?;
|
||||
info!("initialized segno {} at lsn {}", segno, init_lsn);
|
||||
// note: file is *not* fsynced
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write WAL to disk.
|
||||
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
// Disallow any non-sequential writes, which can result in gaps or overwrites.
|
||||
|
||||
@@ -182,10 +182,6 @@ impl wal_storage::Storage for DiskWALStorage {
|
||||
self.flush_record_lsn
|
||||
}
|
||||
|
||||
async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write piece of WAL from buf to disk, but not necessarily sync it.
|
||||
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
if self.write_lsn != startpos {
|
||||
|
||||
@@ -17,7 +17,8 @@ use utils::lsn::Lsn;
|
||||
use walproposer::{
|
||||
api_bindings::Level,
|
||||
bindings::{
|
||||
NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE,
|
||||
pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents,
|
||||
WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE,
|
||||
},
|
||||
walproposer::{ApiImpl, Config},
|
||||
};
|
||||
@@ -223,13 +224,31 @@ impl SimulationApi {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let empty_feedback = PageserverFeedback {
|
||||
present: false,
|
||||
currentClusterSize: 0,
|
||||
last_received_lsn: 0,
|
||||
disk_consistent_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
replytime: 0,
|
||||
shard_number: 0,
|
||||
};
|
||||
|
||||
Self {
|
||||
os: args.os,
|
||||
safekeepers: RefCell::new(sk_conns),
|
||||
disk: args.disk,
|
||||
redo_start_lsn: args.redo_start_lsn,
|
||||
last_logged_commit_lsn: 0,
|
||||
shmem: UnsafeCell::new(walproposer::api_bindings::empty_shmem()),
|
||||
shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
|
||||
mutex: 0,
|
||||
mineLastElectedTerm: 0,
|
||||
backpressureThrottlingTime: pg_atomic_uint64 { value: 0 },
|
||||
currentClusterSize: pg_atomic_uint64 { value: 0 },
|
||||
shard_ps_feedback: [empty_feedback; 128],
|
||||
num_shards: 0,
|
||||
min_ps_feedback: empty_feedback,
|
||||
}),
|
||||
config: args.config,
|
||||
event_set: RefCell::new(None),
|
||||
}
|
||||
@@ -255,12 +274,6 @@ impl ApiImpl for SimulationApi {
|
||||
self.os.now() as i64 * 1000
|
||||
}
|
||||
|
||||
fn update_donor(&self, donor: &mut walproposer::bindings::Safekeeper, donor_lsn: u64) {
|
||||
let mut shmem = unsafe { *self.get_shmem_state() };
|
||||
shmem.propEpochStartLsn.value = donor_lsn;
|
||||
shmem.donor_conninfo = donor.conninfo;
|
||||
}
|
||||
|
||||
fn conn_status(
|
||||
&self,
|
||||
_: &mut walproposer::bindings::Safekeeper,
|
||||
|
||||
@@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import (
|
||||
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name
|
||||
from fixtures.pageserver.types import IndexPartDump
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
@@ -68,7 +68,7 @@ from fixtures.remote_storage import (
|
||||
RemoteStorageUser,
|
||||
S3Storage,
|
||||
default_remote_storage,
|
||||
remote_storage_to_toml_dict,
|
||||
remote_storage_to_toml_inline_table,
|
||||
)
|
||||
from fixtures.safekeeper.http import SafekeeperHttpClient
|
||||
from fixtures.safekeeper.utils import are_walreceivers_absent
|
||||
@@ -82,7 +82,6 @@ from fixtures.utils import (
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
@@ -466,7 +465,6 @@ class NeonEnvBuilder:
|
||||
initial_tenant: Optional[TenantId] = None,
|
||||
initial_timeline: Optional[TimelineId] = None,
|
||||
pageserver_virtual_file_io_engine: Optional[str] = None,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -490,7 +488,6 @@ class NeonEnvBuilder:
|
||||
self.env: Optional[NeonEnv] = None
|
||||
self.keep_remote_storage_contents: bool = True
|
||||
self.neon_binpath = neon_binpath
|
||||
self.neon_local_binpath = neon_binpath
|
||||
self.pg_distrib_dir = pg_distrib_dir
|
||||
self.pg_version = pg_version
|
||||
self.preserve_database_files = preserve_database_files
|
||||
@@ -522,8 +519,6 @@ class NeonEnvBuilder:
|
||||
self.pageserver_validate_vectored_get = bool(validate)
|
||||
log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"')
|
||||
|
||||
self.pageserver_aux_file_policy = pageserver_aux_file_policy
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
), "Unexpectedly instantiated from outside a test function"
|
||||
@@ -569,7 +564,6 @@ class NeonEnvBuilder:
|
||||
timeline_id=env.initial_timeline,
|
||||
shard_count=initial_tenant_shard_count,
|
||||
shard_stripe_size=initial_tenant_shard_stripe_size,
|
||||
aux_file_v2=self.pageserver_aux_file_policy,
|
||||
)
|
||||
assert env.initial_tenant == initial_tenant
|
||||
assert env.initial_timeline == initial_timeline
|
||||
@@ -638,11 +632,17 @@ class NeonEnvBuilder:
def from_repo_dir(
self,
repo_dir: Path,
neon_binpath: Optional[Path] = None,
pg_distrib_dir: Optional[Path] = None,
) -> NeonEnv:
"""
A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir.
"""

# Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests
self.neon_binpath = neon_binpath or self.neon_binpath
self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir

# Get the initial tenant and timeline from the snapshot config
snapshot_config_toml = repo_dir / "config"
with snapshot_config_toml.open("r") as f:
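For orientation, the forward-compatibility test later in this diff combines these new parameters with the current build's neon_local binary; a condensed sketch, using that test's variable names:

    # previous release's binaries for pageserver/safekeeper/postgres,
    # but keep the current build's neon_local for CLI compatibility
    neon_local_binpath = neon_env_builder.neon_binpath
    env = neon_env_builder.from_repo_dir(
        compatibility_snapshot_dir / "repo",
        neon_binpath=compatibility_neon_bin,
        pg_distrib_dir=compatibility_postgres_distrib_dir,
    )
    env.neon_local_binpath = neon_local_binpath
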
@@ -982,7 +982,7 @@ class NeonEnv:
|
||||
|
||||
Some notable functions and fields in NeonEnv:
|
||||
|
||||
endpoints - A factory object for creating postgres compute nodes.
|
||||
postgres - A factory object for creating postgres compute nodes.
|
||||
|
||||
pageservers - An array containing objects representing the pageservers
|
||||
|
||||
@@ -1017,10 +1017,9 @@ class NeonEnv:
|
||||
self.pg_version = config.pg_version
|
||||
# Binary path for pageserver, safekeeper, etc
|
||||
self.neon_binpath = config.neon_binpath
|
||||
# Binary path for neon_local test-specific binaries
|
||||
self.neon_local_binpath = config.neon_local_binpath
|
||||
if self.neon_local_binpath is None:
|
||||
self.neon_local_binpath = self.neon_binpath
|
||||
# Binary path for neon_local test-specific binaries: may be overridden
|
||||
# after construction for compat testing
|
||||
self.neon_local_binpath = config.neon_binpath
|
||||
self.pg_distrib_dir = config.pg_distrib_dir
|
||||
self.endpoint_counter = 0
|
||||
self.storage_controller_config = config.storage_controller_config
|
||||
@@ -1052,7 +1051,6 @@ class NeonEnv:
|
||||
)
|
||||
|
||||
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
|
||||
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
|
||||
|
||||
# Create a config file corresponding to the options
|
||||
cfg: Dict[str, Any] = {
|
||||
@@ -1289,7 +1287,6 @@ def _shared_simple_env(
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
) -> Iterator[NeonEnv]:
|
||||
"""
|
||||
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
|
||||
@@ -1320,7 +1317,6 @@ def _shared_simple_env(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1360,7 +1356,6 @@ def neon_env_builder(
|
||||
test_overlay_dir: Path,
|
||||
top_output_dir: Path,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1394,7 +1389,6 @@ def neon_env_builder(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
test_overlay_dir=test_overlay_dir,
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
) as builder:
|
||||
yield builder
|
||||
|
||||
@@ -1554,7 +1548,6 @@ class NeonCli(AbstractNeonCli):
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
placement_policy: Optional[str] = None,
|
||||
set_default: bool = False,
|
||||
aux_file_v2: Optional[AuxFileStore] = None,
|
||||
) -> Tuple[TenantId, TimelineId]:
|
||||
"""
|
||||
Creates a new tenant, returns its id and its initial timeline's id.
|
||||
@@ -1578,16 +1571,6 @@ class NeonCli(AbstractNeonCli):
|
||||
product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
|
||||
)
|
||||
)
|
||||
|
||||
if aux_file_v2 is AuxFileStore.V2:
|
||||
args.extend(["-c", "switch_aux_file_policy:v2"])
|
||||
|
||||
if aux_file_v2 is AuxFileStore.V1:
|
||||
args.extend(["-c", "switch_aux_file_policy:v1"])
|
||||
|
||||
if aux_file_v2 is AuxFileStore.CrossValidation:
|
||||
args.extend(["-c", "switch_aux_file_policy:cross_validation"])
|
||||
|
||||
if set_default:
|
||||
args.append("--set-default")
|
||||
|
||||
@@ -1730,7 +1713,8 @@ class NeonCli(AbstractNeonCli):
|
||||
|
||||
ps_config = {}
|
||||
if remote_storage is not None:
|
||||
ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage)
|
||||
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
|
||||
ps_config["remote_storage"] = remote_storage_toml_table
|
||||
|
||||
if pageserver_config_override is not None:
|
||||
for o in pageserver_config_override.split(";"):
|
||||
@@ -1832,7 +1816,6 @@ class NeonCli(AbstractNeonCli):
|
||||
hot_standby: bool = False,
|
||||
lsn: Optional[Lsn] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple=False,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -1856,8 +1839,6 @@ class NeonCli(AbstractNeonCli):
|
||||
args.extend(["--hot-standby", "true"])
|
||||
if pageserver_id is not None:
|
||||
args.extend(["--pageserver-id", str(pageserver_id)])
|
||||
if allow_multiple:
|
||||
args.extend(["--allow-multiple"])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res.check_returncode()
|
||||
@@ -1869,7 +1850,6 @@ class NeonCli(AbstractNeonCli):
|
||||
safekeepers: Optional[List[int]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple=False,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -1884,8 +1864,6 @@ class NeonCli(AbstractNeonCli):
|
||||
args.append(endpoint_id)
|
||||
if pageserver_id is not None:
|
||||
args.extend(["--pageserver-id", str(pageserver_id)])
|
||||
if allow_multiple:
|
||||
args.extend(["--allow-multiple"])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res.check_returncode()
|
||||
@@ -2449,16 +2427,20 @@ class NeonPageserver(PgProtocol, LogUtils):
def config_toml_path(self) -> Path:
return self.workdir / "pageserver.toml"

def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]):
def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], bool]):
"""
Edit the pageserver's config toml file in place.

The `edit_fn` is to manipulate the dict, and if it returns True, the file will be written.
If it returns False, no changes are made to the file system.
"""
path = self.config_toml_path
with open(path, "r") as f:
config = toml.load(f)
edit_fn(config)
with open(path, "w") as f:
toml.dump(config, f)
save = edit_fn(config)
if save:
with open(path, "w") as f:
toml.dump(config, f)

def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
"""
@@ -2469,12 +2451,13 @@ class NeonPageserver(PgProtocol, LogUtils):
"""
replacements = {}

def doit(config: Dict[str, Any]):
def doit(config: Dict[str, Any]) -> bool:
while len(patch) > 0:
key, new = patch.popitem()
old = config.get(key, None)
config[key] = new
replacements[key] = old
return True

self.edit_config_toml(doit)
return replacements
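To make the new return-value contract concrete, a minimal sketch (hypothetical test code, not part of this diff) of an edit_fn that only persists the file when it actually changed something:

    def enable_short_compaction_period(config: dict) -> bool:
        tenant_conf = config.setdefault("tenant_config", {})
        if tenant_conf.get("compaction_period") == "3s":
            return False  # nothing changed; pageserver.toml is left untouched
        tenant_conf["compaction_period"] = "3s"
        return True  # changed; edit_config_toml writes the file back

    env.pageserver.edit_config_toml(enable_short_compaction_period)  # `env` assumed to be a NeonEnv
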
@@ -2652,37 +2635,6 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
)
|
||||
|
||||
def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
|
||||
"""
|
||||
Inspect local storage on a pageserver to discover which layer files are present.
|
||||
|
||||
:return: list of relative paths to layers, from the timeline root.
|
||||
"""
|
||||
timeline_path = self.timeline_dir(tenant_id, timeline_id)
|
||||
|
||||
def relative(p: Path) -> Path:
|
||||
return p.relative_to(timeline_path)
|
||||
|
||||
return sorted(
|
||||
list(
|
||||
map(
|
||||
relative,
|
||||
filter(
|
||||
lambda path: path.name != "metadata"
|
||||
and "ephemeral" not in path.name
|
||||
and "temp" not in path.name,
|
||||
timeline_path.glob("*"),
|
||||
),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
def layer_exists(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName
|
||||
) -> bool:
|
||||
layers = self.list_layers(tenant_id, timeline_id)
|
||||
return layer_name in [parse_layer_file_name(p.name) for p in layers]
|
||||
|
||||
|
||||
class PgBin:
|
||||
"""A helper class for executing postgres binaries"""
|
||||
@@ -3371,7 +3323,6 @@ class Endpoint(PgProtocol):
|
||||
lsn: Optional[Lsn] = None,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple: bool = False,
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Create a new Postgres endpoint.
|
||||
@@ -3394,7 +3345,6 @@ class Endpoint(PgProtocol):
|
||||
pg_port=self.pg_port,
|
||||
http_port=self.http_port,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
path = Path("endpoints") / self.endpoint_id / "pgdata"
|
||||
self.pgdata_dir = os.path.join(self.env.repo_dir, path)
|
||||
@@ -3411,10 +3361,7 @@ class Endpoint(PgProtocol):
|
||||
return self
|
||||
|
||||
def start(
|
||||
self,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple: bool = False,
|
||||
self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Start the Postgres instance.
|
||||
@@ -3430,7 +3377,6 @@ class Endpoint(PgProtocol):
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
self.running = True
|
||||
|
||||
@@ -3560,7 +3506,6 @@ class Endpoint(PgProtocol):
|
||||
config_lines: Optional[List[str]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple=False,
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Create an endpoint, apply config, and start Postgres.
|
||||
@@ -3576,12 +3521,7 @@ class Endpoint(PgProtocol):
|
||||
hot_standby=hot_standby,
|
||||
lsn=lsn,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
).start(
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id)
|
||||
|
||||
log.info(f"Postgres startup took {time.time() - started_at} seconds")
|
||||
|
||||
|
||||
@@ -89,8 +89,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# During teardown, we stop the storage controller before the pageservers, so pageservers
# can experience connection errors doing background deletion queue work.
".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*",
# Can happen when the test shuts down the storage controller while it is calling the utilization API
".*WARN.*path=/v1/utilization .*request was dropped before completing",
)

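These defaults are the baseline allow-list; individual tests extend it in the same way, for example (a sketch mirroring test_crafted_wal_end further down in this diff):

    env.pageserver.allowed_errors.extend(
        [
            # seems like pageserver stop triggers these
            ".*initial size calculation failed.*Bad state (not active).*",
        ]
    )
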
@@ -819,23 +819,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
continue
|
||||
self.download_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
def detach_ancestor(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
batch_size: int | None = None,
|
||||
) -> Set[TimelineId]:
|
||||
params = {}
|
||||
if batch_size is not None:
|
||||
params["batch_size"] = batch_size
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor",
|
||||
params=params,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
json = res.json()
|
||||
return set(map(TimelineId, json["reparented_timelines"]))
|
||||
|
||||
def evict_layer(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
|
||||
):
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Tuple, Union
|
||||
|
||||
@@ -48,36 +47,46 @@ class InvalidFileName(Exception):
pass


IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$")


def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
"""Parse an image layer file name. Return key start, key end, and snapshot lsn"""

match = IMAGE_LAYER_FILE_NAME.match(f_name)
if match is None:
raise InvalidFileName(f"'{f_name}' is not an image layer filename")

return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16)

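As a quick, self-contained illustration of what the image-layer regex above accepts (the file name below is invented for the example; real names come from the pageserver):

    import re

    IMAGE_LAYER_FILE_NAME = re.compile(
        "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$"
    )
    name = "0" * 36 + "-" + "F" * 36 + "__" + "00000000016B59D8"
    m = IMAGE_LAYER_FILE_NAME.match(name)
    assert m is not None
    key_start, key_end, lsn = (int(m.group(i), 16) for i in (1, 2, 3))
    assert (key_start, lsn) == (0, 0x016B59D8)
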
DELTA_LAYER_FILE_NAME = re.compile(
|
||||
"^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$"
|
||||
)
|
||||
parts = f_name.split("__")
|
||||
if len(parts) != 2:
|
||||
raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
|
||||
key_parts = parts[0].split("-")
|
||||
if len(key_parts) != 2:
|
||||
raise InvalidFileName(
|
||||
f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
|
||||
)
|
||||
try:
|
||||
return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16)
|
||||
except ValueError as e:
|
||||
raise InvalidFileName(f"conversion error: {f_name}") from e
|
||||
|
||||
|
||||
def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
|
||||
"""Parse a delta layer file name. Return key start, key end, lsn start, and lsn end"""
|
||||
match = DELTA_LAYER_FILE_NAME.match(f_name)
|
||||
if match is None:
|
||||
raise InvalidFileName(f"'{f_name}' is not a delta layer filename")
|
||||
|
||||
return (
|
||||
int(match.group(1), 16),
|
||||
int(match.group(2), 16),
|
||||
int(match.group(3), 16),
|
||||
int(match.group(4), 16),
|
||||
)
|
||||
parts = f_name.split("__")
|
||||
if len(parts) != 2:
|
||||
raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
|
||||
key_parts = parts[0].split("-")
|
||||
if len(key_parts) != 2:
|
||||
raise InvalidFileName(
|
||||
f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
|
||||
)
|
||||
lsn_parts = parts[1].split("-")
|
||||
if len(lsn_parts) != 2:
|
||||
raise InvalidFileName(
|
||||
f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}"
|
||||
)
|
||||
try:
|
||||
return (
|
||||
int(key_parts[0], 16),
|
||||
int(key_parts[1], 16),
|
||||
int(lsn_parts[0], 16),
|
||||
int(lsn_parts[1], 16),
|
||||
)
|
||||
except ValueError as e:
|
||||
raise InvalidFileName(f"conversion error: {f_name}") from e
|
||||
|
||||
|
||||
def parse_layer_file_name(file_name: str) -> LayerFileName:
|
||||
|
||||
@@ -5,7 +5,6 @@ import pytest
|
||||
from _pytest.python import Metafunc
|
||||
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import AuxFileStore
|
||||
|
||||
"""
|
||||
Dynamically parametrize tests by different parameters
|
||||
@@ -32,11 +31,6 @@ def pageserver_virtual_file_io_engine() -> Optional[str]:
|
||||
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
|
||||
return None
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc: Metafunc):
|
||||
if (bt := os.getenv("BUILD_TYPE")) is None:
|
||||
build_types = ["debug", "release"]
|
||||
|
||||
@@ -141,13 +141,11 @@ class LocalFsStorage:
|
||||
with self.heatmap_path(tenant_id).open("r") as f:
|
||||
return json.load(f)
|
||||
|
||||
def to_toml_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
def to_toml_inline_table(self) -> str:
|
||||
rv = {
|
||||
"local_path": str(self.root),
|
||||
}
|
||||
|
||||
def to_toml_inline_table(self) -> str:
|
||||
return toml.TomlEncoder().dump_inline_table(self.to_toml_dict())
|
||||
return toml.TomlEncoder().dump_inline_table(rv)
|
||||
|
||||
def cleanup(self):
|
||||
# no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files
|
||||
@@ -196,7 +194,7 @@ class S3Storage:
|
||||
}
|
||||
)
|
||||
|
||||
def to_toml_dict(self) -> Dict[str, Any]:
|
||||
def to_toml_inline_table(self) -> str:
|
||||
rv = {
|
||||
"bucket_name": self.bucket_name,
|
||||
"bucket_region": self.bucket_region,
|
||||
@@ -208,10 +206,7 @@ class S3Storage:
|
||||
if self.endpoint is not None:
|
||||
rv["endpoint"] = self.endpoint
|
||||
|
||||
return rv
|
||||
|
||||
def to_toml_inline_table(self) -> str:
|
||||
return toml.TomlEncoder().dump_inline_table(self.to_toml_dict())
|
||||
return toml.TomlEncoder().dump_inline_table(rv)
|
||||
|
||||
def do_cleanup(self):
|
||||
if not self.cleanup:
|
||||
@@ -419,13 +414,6 @@ def default_remote_storage() -> RemoteStorageKind:
|
||||
return RemoteStorageKind.LOCAL_FS
|
||||
|
||||
|
||||
def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]:
|
||||
if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
|
||||
raise Exception("invalid remote storage type")
|
||||
|
||||
return remote_storage.to_toml_dict()
|
||||
|
||||
|
||||
# serialize as toml inline table
|
||||
def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
|
||||
if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import contextlib
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -485,16 +484,3 @@ def assert_no_errors(log_file, service, allowed_errors):
|
||||
log.info(f"not allowed {service} error: {error.strip()}")
|
||||
|
||||
assert not errors, f"Log errors on {service}: {errors[0]}"


@enum.unique
class AuxFileStore(str, enum.Enum):
V1 = "V1"
V2 = "V2"
CrossValidation = "CrossValidation"

def __repr__(self) -> str:
return f"'aux-{self.value}'"

def __str__(self) -> str:
return f"'aux-{self.value}'"

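Based only on the AuxFileStore definition shown in this hunk, the string forms are meant to read like this (a tiny self-contained check):

    policy = AuxFileStore.CrossValidation
    assert str(policy) == "'aux-CrossValidation'"
    assert repr(policy) == "'aux-CrossValidation'"
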
@@ -141,10 +141,9 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
|
||||
# start without gc so we can time compaction with less noise; use shorter
|
||||
# period for compaction so it starts earlier
|
||||
def patch_default_tenant_config(config):
|
||||
tenant_config = config.get("tenant_config", {})
|
||||
tenant_config["compaction_period"] = "3s"
|
||||
tenant_config["gc_period"] = "0s"
|
||||
config["tenant_config"] = tenant_config
|
||||
config["compaction_period"] = "3s"
|
||||
config["gc_period"] = "0s"
|
||||
return True
|
||||
|
||||
env.pageserver.edit_config_toml(patch_default_tenant_config)
|
||||
env.pageserver.start(
|
||||
|
||||
@@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"trace_read_requests": True,
|
||||
"walreceiver_connect_timeout": "13m",
|
||||
"image_layer_creation_check_threshold": 1,
|
||||
"switch_aux_file_policy": "CrossValidation",
|
||||
"switch_to_aux_file_v2": True,
|
||||
}
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
@@ -233,18 +233,17 @@ def test_forward_compatibility(
|
||||
neon_env_builder.pageserver_validate_vectored_get = None
|
||||
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
# Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.).
|
||||
# But always use the current version's neon_local binary.
|
||||
# This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI.
|
||||
neon_env_builder.neon_binpath = compatibility_neon_bin
|
||||
neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir
|
||||
neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath
|
||||
|
||||
neon_local_binpath = neon_env_builder.neon_binpath
|
||||
env = neon_env_builder.from_repo_dir(
|
||||
compatibility_snapshot_dir / "repo",
|
||||
neon_binpath=compatibility_neon_bin,
|
||||
pg_distrib_dir=compatibility_postgres_distrib_dir,
|
||||
)
|
||||
|
||||
# Use current neon_local even though we're using old binaries for
|
||||
# everything else: our test code is written for latest CLI args.
|
||||
env.neon_local_binpath = neon_local_binpath
|
||||
|
||||
neon_env_builder.start()
|
||||
|
||||
check_neon_works(
|
||||
|
||||
@@ -19,12 +19,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
|
||||
def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
|
||||
env = neon_env_builder.init_start()
|
||||
env.neon_cli.create_branch("test_crafted_wal_end")
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# seems like pageserver stop triggers these
|
||||
".*initial size calculation failed.*Bad state (not active).*",
|
||||
]
|
||||
)
|
||||
|
||||
endpoint = env.endpoints.create("test_crafted_wal_end")
|
||||
wal_craft = WalCraft(env)
|
||||
|
||||
@@ -47,9 +47,8 @@ def test_min_resident_size_override_handling(
|
||||
if config_level_override is not None:
|
||||
|
||||
def set_min_resident_size(config):
|
||||
tenant_config = config.get("tenant_config", {})
|
||||
tenant_config["min_resident_size_override"] = config_level_override
|
||||
config["tenant_config"] = tenant_config
|
||||
config["tenant_config"] = {"min_resident_size": config_level_override}
|
||||
return True
|
||||
|
||||
env.pageserver.edit_config_toml(set_min_resident_size)
|
||||
env.pageserver.stop()
|
||||
|
||||
@@ -2,7 +2,6 @@ import time
|
||||
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
|
||||
from fixtures.pageserver.types import parse_layer_file_name
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload_queue_empty,
|
||||
@@ -87,7 +86,14 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
|
||||
# path = env.remote_storage.timeline_path(tenant_id, timeline_id)
|
||||
l1_found = None
|
||||
for path in env.pageserver.list_layers(tenant_id, timeline_id):
|
||||
for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir():
|
||||
if path.name == "metadata" or path.name.startswith("ephemeral-"):
|
||||
continue
|
||||
|
||||
if len(path.suffixes) > 0:
|
||||
# temp files
|
||||
continue
|
||||
|
||||
[key_range, lsn_range] = path.name.split("__", maxsplit=1)
|
||||
|
||||
if "-" not in lsn_range:
|
||||
@@ -102,21 +108,19 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
|
||||
if l1_found is not None:
|
||||
raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}")
|
||||
l1_found = parse_layer_file_name(path.name)
|
||||
l1_found = path
|
||||
|
||||
assert l1_found is not None, "failed to find L1 locally"
|
||||
|
||||
uploaded = env.pageserver_remote_storage.remote_layer_path(
|
||||
tenant_id, timeline_id, l1_found.to_str()
|
||||
tenant_id, timeline_id, l1_found.name
|
||||
)
|
||||
assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
|
||||
|
||||
env.pageserver.start()
|
||||
wait_until_tenant_active(pageserver_http, tenant_id)
|
||||
|
||||
assert not env.pageserver.layer_exists(
|
||||
tenant_id, timeline_id, l1_found
|
||||
), "partial compaction result should had been removed during startup"
|
||||
assert not l1_found.exists(), "partial compaction result should had been removed during startup"
|
||||
|
||||
# wait for us to catch up again
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
|
||||
@@ -126,18 +130,18 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
# give time for log flush
|
||||
time.sleep(1)
|
||||
|
||||
message = f".*duplicated L1 layer layer={l1_found}"
|
||||
message = f".*duplicated L1 layer layer={l1_found.name}"
|
||||
found_msg = env.pageserver.log_contains(message)
|
||||
# resident or evicted, it should not be overwritten, however it should had been non-existing at startup
|
||||
assert (
|
||||
found_msg is None
|
||||
), "layer should had been removed during startup, did it live on as evicted?"
|
||||
|
||||
assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears"
|
||||
assert l1_found.exists(), "the L1 reappears"
|
||||
|
||||
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
|
||||
|
||||
uploaded = env.pageserver_remote_storage.remote_layer_path(
|
||||
tenant_id, timeline_id, l1_found.to_str()
|
||||
tenant_id, timeline_id, l1_found.name
|
||||
)
|
||||
assert uploaded.exists(), "the L1 is uploaded"
|
||||
|
||||
@@ -7,7 +7,6 @@ from fixtures.neon_fixtures import (
|
||||
flush_ep_to_pageserver,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.types import parse_layer_file_name
|
||||
from fixtures.pageserver.utils import wait_for_upload
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
|
||||
@@ -58,9 +57,9 @@ def test_basic_eviction(
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
|
||||
initial_local_layers = dict(
|
||||
(parse_layer_file_name(path.name), path)
|
||||
for path in env.pageserver.list_layers(tenant_id, timeline_id)
|
||||
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
initial_local_layers = sorted(
|
||||
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
|
||||
)
|
||||
assert (
|
||||
len(initial_local_layers) > 1
|
||||
@@ -74,7 +73,6 @@ def test_basic_eviction(
|
||||
assert len(initial_local_layers) == len(
|
||||
initial_layer_map_info.historic_layers
|
||||
), "Should have the same layers in memory and on disk"
|
||||
|
||||
for returned_layer in initial_layer_map_info.historic_layers:
|
||||
assert (
|
||||
returned_layer.kind == "Delta"
|
||||
@@ -83,29 +81,27 @@ def test_basic_eviction(
|
||||
not returned_layer.remote
|
||||
), f"All created layers should be present locally, but got {returned_layer}"
|
||||
|
||||
returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name)
|
||||
assert (
|
||||
returned_layer_name in initial_local_layers
|
||||
), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}"
|
||||
|
||||
local_layer_path = (
|
||||
env.pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
/ initial_local_layers[returned_layer_name]
|
||||
local_layers = list(
|
||||
filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers)
|
||||
)
|
||||
assert (
|
||||
returned_layer.layer_file_size == local_layer_path.stat().st_size
|
||||
), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}"
|
||||
len(local_layers) == 1
|
||||
), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}"
|
||||
local_layer = local_layers[0]
|
||||
assert (
|
||||
returned_layer.layer_file_size == local_layer.stat().st_size
|
||||
), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}"
|
||||
|
||||
# Detach all layers, ensure they are not in the local FS, but are still dumped as part of the layer map
|
||||
for local_layer_name, local_layer_path in initial_local_layers.items():
|
||||
for local_layer in initial_local_layers:
|
||||
client.evict_layer(
|
||||
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name
|
||||
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name
|
||||
)
|
||||
assert not env.pageserver.layer_exists(
|
||||
tenant_id, timeline_id, local_layer_name
|
||||
), f"Did not expect to find {local_layer_name} layer after evicting"
|
||||
assert not any(
|
||||
new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*")
|
||||
), f"Did not expect to find {local_layer} layer after evicting"
|
||||
|
||||
empty_layers = env.pageserver.list_layers(tenant_id, timeline_id)
|
||||
empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
|
||||
assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
|
||||
|
||||
evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
|
||||
@@ -122,15 +118,15 @@ def test_basic_eviction(
|
||||
assert (
|
||||
returned_layer.remote
|
||||
), f"All layers should be evicted and not present locally, but got {returned_layer}"
|
||||
returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name)
|
||||
assert (
|
||||
returned_layer_name in initial_local_layers
|
||||
assert any(
|
||||
local_layer.name == returned_layer.layer_file_name
|
||||
for local_layer in initial_local_layers
|
||||
), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}"
|
||||
|
||||
# redownload all evicted layers and ensure the initial state is restored
|
||||
for local_layer_name, _local_layer_path in initial_local_layers.items():
|
||||
for local_layer in initial_local_layers:
|
||||
client.download_layer(
|
||||
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str()
|
||||
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name
|
||||
)
|
||||
client.timeline_download_remote_layers(
|
||||
tenant_id,
|
||||
@@ -141,9 +137,8 @@ def test_basic_eviction(
|
||||
at_least_one_download=False,
|
||||
)
|
||||
|
||||
redownloaded_layers = dict(
|
||||
(parse_layer_file_name(path.name), path)
|
||||
for path in env.pageserver.list_layers(tenant_id, timeline_id)
|
||||
redownloaded_layers = sorted(
|
||||
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
|
||||
)
|
||||
assert (
|
||||
redownloaded_layers == initial_local_layers
|
||||
|
||||
@@ -6,9 +6,7 @@ from string import ascii_lowercase
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
AuxFileStore,
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
logical_replication_sync,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
@@ -20,19 +18,6 @@ def random_string(n: int):
|
||||
return "".join([choice(ascii_lowercase) for _ in range(n)])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore):
|
||||
env = neon_simple_env
|
||||
with env.pageserver.http_client() as client:
|
||||
tenant_config = client.tenant_config(env.initial_tenant).effective_config
|
||||
assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -174,9 +159,6 @@ COMMIT;
|
||||
|
||||
|
||||
# Test that neon.logical_replication_max_snap_files works
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
def slot_removed(ep):
|
||||
assert (
|
||||
@@ -221,86 +203,8 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint))
|
||||
|
||||
|
||||
# Tests that walsender correctly blocks until WAL is downloaded from safekeepers
|
||||
def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.neon_cli.create_branch("init")
|
||||
endpoint = env.endpoints.create_start("init")
|
||||
|
||||
with endpoint.connect().cursor() as cur:
|
||||
cur.execute("create table wal_generator (id serial primary key, data text)")
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO wal_generator (data)
|
||||
SELECT repeat('A', 1024) -- Generates a kilobyte of data per row
|
||||
FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data
|
||||
"""
|
||||
)
|
||||
cur.execute("create table t(a int)")
|
||||
cur.execute("create publication pub for table t")
|
||||
cur.execute("insert into t values (1)")
|
||||
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create table t(a int)")
|
||||
connstr = endpoint.connstr().replace("'", "''")
|
||||
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub")
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
vanilla_pg.stop()
|
||||
|
||||
# Pause the safekeepers so that they can't send WAL (except to pageserver)
|
||||
for sk in env.safekeepers:
|
||||
sk_http = sk.http_client()
|
||||
sk_http.configure_failpoints([("sk-pause-send", "return")])
|
||||
|
||||
# Insert a 2
|
||||
with endpoint.connect().cursor() as cur:
|
||||
cur.execute("insert into t values (2)")
|
||||
|
||||
endpoint.stop_and_destroy()
|
||||
|
||||
# This new endpoint should contain [1, 2], but it can't access WAL from safekeeper
|
||||
endpoint = env.endpoints.create_start("init")
|
||||
with endpoint.connect().cursor() as cur:
|
||||
cur.execute("select * from t")
|
||||
res = [r[0] for r in cur.fetchall()]
|
||||
assert res == [1, 2]
|
||||
|
||||
# Reconnect subscriber
|
||||
vanilla_pg.start()
|
||||
connstr = endpoint.connstr().replace("'", "''")
|
||||
vanilla_pg.safe_psql(f"alter subscription sub1 connection '{connstr}'")
|
||||
|
||||
time.sleep(5)
|
||||
# Make sure the 2 isn't replicated
|
||||
assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1]
|
||||
|
||||
# Re-enable WAL download
|
||||
for sk in env.safekeepers:
|
||||
sk_http = sk.http_client()
|
||||
sk_http.configure_failpoints([("sk-pause-send", "off")])
|
||||
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2]
|
||||
|
||||
# Check that local reads also work
|
||||
with endpoint.connect().cursor() as cur:
|
||||
cur.execute("insert into t values (3)")
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3]
|
||||
|
||||
log_path = vanilla_pg.pgdatadir / "pg.log"
|
||||
with open(log_path, "r") as log_file:
|
||||
logs = log_file.read()
|
||||
assert "could not receive data from WAL stream" not in logs
|
||||
|
||||
|
||||
# Test compute start at LSN page of which starts with contrecord
|
||||
# https://github.com/neondatabase/neon/issues/5749
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -391,9 +295,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
# logical replication bug as such, but without logical replication,
|
||||
# records passed to the WAL redo process are never large enough to hit
|
||||
# the bug.
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -465,9 +366,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
|
||||
ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_replication_shutdown(neon_simple_env: NeonEnv):
|
||||
# Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -119,11 +119,11 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
cur = endpoint_main.connect().cursor()
|
||||
# Create table, and insert rows, each in a separate transaction
|
||||
# Enable synchronous commit as we are timing sensitive
|
||||
# Disable synchronous_commit to make this initialization go faster.
|
||||
#
|
||||
# Each row contains current insert LSN and the current timestamp, when
|
||||
# the row was inserted.
|
||||
cur.execute("SET synchronous_commit=on")
|
||||
cur.execute("SET synchronous_commit=off")
|
||||
cur.execute("CREATE TABLE foo (x integer)")
|
||||
tbl = []
|
||||
for i in range(1000):
|
||||
@@ -132,7 +132,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
|
||||
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc)
|
||||
after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()")
|
||||
tbl.append([i, after_timestamp, after_lsn])
|
||||
time.sleep(0.02)
|
||||
time.sleep(0.005)
|
||||
|
||||
# Execute one more transaction with synchronous_commit enabled, to flush
|
||||
# all the previous transactions
|
||||
|
||||
@@ -10,7 +10,6 @@ of the pageserver are:
|
||||
"""
|
||||
|
||||
import enum
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
@@ -701,50 +700,3 @@ def test_multi_attach(
|
||||
|
||||
# All data we wrote while multi-attached remains readable
|
||||
workload.validate(pageservers[2].id)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="To be enabled after release with new local path style")
|
||||
def test_upgrade_generationless_local_file_paths(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Test pageserver behavior when startup up with local layer paths without
|
||||
generation numbers: it should accept these layer files, and avoid doing
|
||||
a delete/download cycle on them.
|
||||
"""
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(1000)
|
||||
|
||||
env.pageserver.stop()
|
||||
|
||||
# Rename the local paths to legacy format, to simulate what
|
||||
# we would see when upgrading
|
||||
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
files_renamed = 0
|
||||
for filename in os.listdir(timeline_dir):
|
||||
path = os.path.join(timeline_dir, filename)
|
||||
log.info(f"Found file {path}")
|
||||
if path.endswith("-00000001"):
|
||||
new_path = path[:-9]
|
||||
os.rename(path, new_path)
|
||||
log.info(f"Renamed {path} -> {new_path}")
|
||||
files_renamed += 1
|
||||
|
||||
assert files_renamed > 0
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
workload.validate()
|
||||
|
||||
# Assert that there were no on-demand downloads
|
||||
assert (
|
||||
env.pageserver.http_client().get_metric_value(
|
||||
"pageserver_remote_ondemand_downloaded_layers_total"
|
||||
)
|
||||
== 0
|
||||
)
|
||||
|
||||
@@ -20,10 +20,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"})
|
||||
== 1
|
||||
)
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
cur = pg_conn.cursor()
|
||||
@@ -58,10 +55,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.start()
|
||||
|
||||
# We reloaded our tenant
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"})
|
||||
== 1
|
||||
)
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
|
||||
|
||||
cur.execute("SELECT count(*) FROM foo")
|
||||
assert cur.fetchone() == (100000,)
|
||||
|
||||
@@ -2,12 +2,12 @@ import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
|
||||
from fixtures.pageserver.types import parse_layer_file_name
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
@@ -51,13 +51,9 @@ def evict_random_layers(
|
||||
if "ephemeral" in layer.name or "temp_download" in layer.name:
|
||||
continue
|
||||
|
||||
layer_name = parse_layer_file_name(layer.name)
|
||||
|
||||
if rng.choice([True, False]):
|
||||
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}")
|
||||
client.evict_layer(
|
||||
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str()
|
||||
)
|
||||
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
|
||||
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", [1, 2, 3])
|
||||
@@ -406,6 +402,32 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
|
||||
validate_heatmap(heatmap_second)


def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
"""
Inspect local storage on a pageserver to discover which layer files are present.

:return: list of relative paths to layers, from the timeline root.
"""
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)

def relative(p: Path) -> Path:
return p.relative_to(timeline_path)

return sorted(
list(
map(
relative,
filter(
lambda path: path.name != "metadata"
and "ephemeral" not in path.name
and "temp" not in path.name,
timeline_path.glob("*"),
),
)
)
)

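The tests below apply this helper symmetrically to both pageservers; condensed, the invariant they check after a secondary download pass is (identifiers as in test_secondary_downloads):

    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
        ps_secondary, tenant_id, timeline_id
    )
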
def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test the overall data flow in secondary mode:
|
||||
@@ -460,8 +482,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
tenant_id, timeline_id
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
ps_secondary, tenant_id, timeline_id
|
||||
)
|
||||
|
||||
# Make changes on attached pageserver, check secondary downloads them
|
||||
@@ -478,8 +500,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
try:
|
||||
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
tenant_id, timeline_id
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
ps_secondary, tenant_id, timeline_id
|
||||
)
|
||||
except:
|
||||
# Do a full listing of the secondary location on errors, to help debug of
|
||||
@@ -501,8 +523,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
# ==================================================================
|
||||
try:
|
||||
log.info("Evicting a layer...")
|
||||
layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0]
|
||||
some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1]
|
||||
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
|
||||
some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
|
||||
log.info(f"Victim layer: {layer_to_evict.name}")
|
||||
ps_attached.http_client().evict_layer(
|
||||
tenant_id, timeline_id, layer_name=layer_to_evict.name
|
||||
@@ -515,13 +537,13 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
|
||||
)
|
||||
assert layer_to_evict.name not in heatmap_layers
|
||||
assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers
|
||||
assert some_other_layer.name in heatmap_layers
|
||||
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id)
|
||||
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
tenant_id, timeline_id
|
||||
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
ps_secondary, tenant_id, timeline_id
|
||||
)
|
||||
except:
|
||||
# On assertion failures, log some details to help with debugging
|
||||
@@ -608,7 +630,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
for timeline_id in timelines:
|
||||
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
|
||||
# One or more layers should be present for all timelines
|
||||
assert ps_secondary.list_layers(tenant_id, timeline_id)
|
||||
assert list_layers(ps_secondary, tenant_id, timeline_id)
|
||||
|
||||
# Delete the second timeline: this should be reflected later on the secondary
|
||||
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
|
||||
@@ -623,10 +645,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
|
||||
|
||||
# This one was not deleted
|
||||
assert ps_secondary.list_layers(tenant_id, timelines[0])
|
||||
assert list_layers(ps_secondary, tenant_id, timelines[0])
|
||||
|
||||
# This one was deleted
|
||||
assert not ps_secondary.list_layers(tenant_id, timelines[1])
|
||||
assert not list_layers(ps_secondary, tenant_id, timelines[1])
|
||||
|
||||
t_end = time.time()
|
||||
|
||||
@@ -686,7 +708,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
|
||||
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# Expect lots of layers
|
||||
assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10
|
||||
assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10
|
||||
|
||||
# Simulate large data by making layer downloads artificially slow
|
||||
for ps in env.pageservers:
|
||||
|
||||
@@ -1,28 +1,13 @@
import os
import shutil

from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.utils import subprocess_capture


def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir):
    # use special --ignore option to ignore the validation checks in pg_waldump
    # this is necessary, because neon WAL files contain gap at the beginning
    output_path, _, _ = subprocess_capture(
        test_output_dir, [pg_waldump_path, "--ignore", segment_path]
    )

    with open(f"{output_path}.stdout", "r") as f:
        stdout = f.read()
        assert "ABORT" in stdout
        assert "COMMIT" in stdout


# Simple test to check that pg_waldump works with neon WAL files
def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin):
    env = neon_simple_env
    tenant_id = env.initial_tenant
    timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty")
    env.neon_cli.create_branch("test_pg_waldump", "empty")
    endpoint = env.endpoints.create_start("test_pg_waldump")

    cur = endpoint.connect().cursor()
@@ -50,12 +35,12 @@ def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin):
    assert endpoint.pgdata_dir
    wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001")
    pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump")
    # check segment on compute
    check_wal_segment(pg_waldump_path, wal_path, test_output_dir)

    # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix.
    sk = env.safekeepers[0]
    sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id)
    non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001")
    shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path)
    check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir)
    # use special --ignore option to ignore the validation checks in pg_waldump
    # this is necessary, because neon WAL files contain gap at the beginning
    output_path, _, _ = subprocess_capture(test_output_dir, [pg_waldump_path, "--ignore", wal_path])

    with open(f"{output_path}.stdout", "r") as f:
        stdout = f.read()
        assert "ABORT" in stdout
        assert "COMMIT" in stdout

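For context, the test above relies on `subprocess_capture` writing the child process's stdout to a file named `<base>.stdout` and returning that base path first (the two further return values are ignored by the test). A rough, hedged sketch of that contract under those assumptions; the file-naming scheme below is illustrative only:

import subprocess
from pathlib import Path


def subprocess_capture_sketch(capture_dir, cmd):
    # Sketch only: run cmd, store its stdout at <base>.stdout, return the base path.
    # The real fixture also returns the captured stdout/stderr, elided here.
    base = Path(capture_dir) / Path(cmd[0]).name
    with open(f"{base}.stdout", "w") as out:
        subprocess.run(cmd, stdout=out, check=True)
    return str(base)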
@@ -1,35 +0,0 @@
import json
import re
from pathlib import Path

from fixtures.neon_fixtures import PgBin
from fixtures.pg_version import PgVersion


def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
    """Test that Postgres version matches the one we expect"""

    with (base_dir / "vendor" / "revisions.json").open() as f:
        expected_revisions = json.load(f)

    output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False)
    stdout = Path(f"{output_prefix}.stdout")
    assert stdout.exists(), "postgres --version didn't print anything to stdout"

    with stdout.open() as f:
        output = f.read().strip()

    # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)".
    pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)"
    match = re.search(pattern, output, re.IGNORECASE)
    assert match is not None, f"Can't parse {output} with {pattern}"

    version = match.group("version")
    commit = match.group("commit")

    assert (
        pg_version.v_prefixed in expected_revisions
    ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"

    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
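The deleted test above encodes an assumption about the shape of `vendor/revisions.json`: each entry maps a prefixed Postgres version to a `[version, commit]` pair. An illustration of that shape with placeholder values (the exact key format produced by `pg_version.v_prefixed` is an assumption here):

import json

# Illustrative only; the values are placeholders, not real revisions.
revisions = json.loads('{"v16": ["16.2", "0123456789abcdef0123456789abcdef01234567"]}')
version, commit = "16.2", "0123456789abcdef0123456789abcdef01234567"
assert [version, commit] == revisions["v16"]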
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.types import parse_layer_file_name
from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
    wait_for_last_record_lsn,
@@ -830,9 +829,8 @@ def test_compaction_waits_for_upload(
    assert len(upload_stuck_layers) > 0

    for name in upload_stuck_layers:
        assert env.pageserver.layer_exists(
            tenant_id, timeline_id, parse_layer_file_name(name)
        ), "while uploads are stuck the layers should be present on disk"
        path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name
        assert path.exists(), "while uploads are stuck the layers should be present on disk"

    # now this will do the L0 => L1 compaction and want to remove
    # upload_stuck_layers and the original initdb L0
@@ -840,9 +838,8 @@ def test_compaction_waits_for_upload(

    # as uploads are paused, the upload_stuck_layers should still be with us
    for name in upload_stuck_layers:
        assert env.pageserver.layer_exists(
            tenant_id, timeline_id, parse_layer_file_name(name)
        ), "uploads are stuck still over compaction"
        path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name
        assert path.exists(), "uploads are stuck still over compaction"

    compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
    overlap = compacted_layers.intersection(upload_stuck_layers)
@@ -876,8 +873,9 @@ def test_compaction_waits_for_upload(
    wait_until(10, 1, until_layer_deletes_completed)

    for name in upload_stuck_layers:
        assert not env.pageserver.layer_exists(
            tenant_id, timeline_id, parse_layer_file_name(name)
        path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name
        assert (
            not path.exists()
        ), "l0 should now be removed because of L0 => L1 compaction and completed uploads"

    # We should not have hit the error handling path in uploads where a uploaded file is gone

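The hunks above trade a `layer_exists(...)`-style assertion for a plain path check against the pageserver's local timeline directory. A hedged sketch of what the replaced check amounts to, assuming `parse_layer_file_name(...).to_str()` yields the canonical on-disk layer file name (the wrapper function itself is hypothetical):

from fixtures.pageserver.types import parse_layer_file_name


def layer_present_locally(pageserver, tenant_id, timeline_id, layer_file_name: str) -> bool:
    # Sketch: normalize the layer name (assumed canonical form) and check the
    # local timeline directory, which is what the new plain-path assert does too.
    canonical = parse_layer_file_name(layer_file_name).to_str()
    return (pageserver.timeline_dir(tenant_id, timeline_id) / canonical).exists()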
@@ -47,7 +47,7 @@ def test_tenant_s3_restore(
    tenant_id = env.initial_tenant

    # Default tenant and the one we created
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    # create two timelines one being the parent of another, both with non-trivial data
    parent = None
@@ -72,13 +72,13 @@ def test_tenant_s3_restore(
    time.sleep(4)

    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
        ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
    ), "tenant removed before we deletion was issued"
    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    ps_http.deletion_queue_flush(execute=True)
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
        ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
    ), "tenant removed before we deletion was issued"
    env.storage_controller.attach_hook_drop(tenant_id)

@@ -116,4 +116,4 @@ def test_tenant_s3_restore(
    # There might be some activity that advances the lsn so we can't use a strict equality check
    assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old"

    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

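The only change in these hunks is dropping the {"mode": "attached"} label filter from `get_metric_value`, i.e. counting all tenant-manager slots rather than only the attached ones. A minimal sketch of how such a label filter is assumed to select a sample; this is not the fixtures' implementation:

def get_metric_value_sketch(samples, name, label_filter=None):
    # samples: iterable of (name, labels-dict, value) tuples, e.g. parsed from /metrics.
    label_filter = label_filter or {}
    for sample_name, labels, value in samples:
        if sample_name == name and all(labels.get(k) == v for k, v in label_filter.items()):
            return value
    return None

# e.g. get_metric_value_sketch(samples, "pageserver_tenant_manager_slots", {"mode": "attached"})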
@@ -177,67 +177,6 @@ def test_sharding_split_unsharded(
    env.storage_controller.consistency_check()


def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
    """
    Test that after a split, we clean up parent layer data in the child shards via compaction.
    """
    TENANT_CONF = {
        # small checkpointing and compaction targets to ensure we generate many upload operations
        "checkpoint_distance": f"{128 * 1024}",
        "compaction_threshold": "1",
        "compaction_target_size": f"{128 * 1024}",
        # no PITR horizon, we specify the horizon when we request on-demand GC
        "pitr_interval": "3600s",
        # disable background compaction and GC. We invoke it manually when we want it to happen.
        "gc_period": "0s",
        "compaction_period": "0s",
        # create image layers eagerly, so that GC can remove some layers
        "image_creation_threshold": "1",
        "image_layer_creation_check_threshold": "0",
    }

    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    # Check that we created with an unsharded TenantShardId: this is the default,
    # but check it in case we change the default in future
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None

    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
    workload.init()
    workload.write_rows(256)
    workload.validate()
    workload.stop()

    # Split one shard into two
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)

    # Check we got the shard IDs we expected
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None

    workload.validate()
    workload.stop()

    env.storage_controller.consistency_check()

    # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant
    for shard in shards:
        ps = env.get_tenant_pageserver(shard)

        # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes
        detail_before = ps.http_client().timeline_detail(shard, timeline_id)
        ps.http_client().timeline_compact(shard, timeline_id)
        detail_after = ps.http_client().timeline_detail(shard, timeline_id)

        # Physical size should shrink because some layers have been dropped
        assert detail_after["current_physical_size"] < detail_before["current_physical_size"]

    # Compaction shouldn't make anything unreadable
    workload.validate()


def test_sharding_split_smoke(
    neon_env_builder: NeonEnvBuilder,
):

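The removed test above relies on the shard identifiers that appear in its asserts. Assuming the `TenantShardId(tenant_id, shard_number, shard_count)` argument order implied by that code (with count 0 denoting an unsharded tenant), the split can be summarized as:

# Before the split: a single, unsharded shard id.
unsharded = TenantShardId(tenant_id, 0, 0)

# After splitting into two shards, the children are numbered 0 and 1 of 2.
children = [TenantShardId(tenant_id, n, 2) for n in range(2)]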
@@ -64,7 +64,7 @@ def test_tenant_delete_smoke(
    )

    # Default tenant and the one we created
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2

    # create two timelines one being the parent of another
    parent = None
@@ -90,9 +90,9 @@ def test_tenant_delete_smoke(

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()
@@ -108,7 +108,7 @@ def test_tenant_delete_smoke(
    )

    # Deletion updates the tenant count: the one default tenant remains
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1


class Check(enum.Enum):
@@ -532,9 +532,7 @@ def test_tenant_delete_concurrent(

    # The TenantSlot is still present while the original request is hung before
    # final removal
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    )
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    # Permit the original request to run to success
    ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))
@@ -558,8 +556,7 @@ def test_tenant_delete_concurrent(
    )

    # Zero tenants remain (we deleted the default tenant)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0


def test_tenant_delete_races_timeline_creation(
@@ -676,7 +673,7 @@ def test_tenant_delete_races_timeline_creation(
    )

    # Zero tenants remain (we deleted the default tenant)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0


def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):

@@ -668,9 +668,9 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):

        client.configure_failpoints((failpoint, "off"))

        # accept both, because the deletion might still complete before
        matcher = "(Failed to refresh gc_info before gathering inputs|NotFound: tenant)"
        with pytest.raises(PageserverApiException, match=matcher):
        with pytest.raises(
            PageserverApiException, match="Failed to refresh gc_info before gathering inputs"
        ):
            completion.result()

    # this happens on both cases

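The behavioural difference in this hunk hinges on `pytest.raises(..., match=...)` treating the matcher as a regular expression applied with `re.search`, so the alternation form accepts either error message while the narrowed form accepts only one. A tiny self-contained illustration:

import pytest


def test_match_is_a_regex_search():
    # The alternation matches either message, just like the wider matcher above.
    with pytest.raises(ValueError, match="(gc_info|NotFound)"):
        raise ValueError("NotFound: tenant")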
@@ -18,7 +18,6 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    last_flush_lsn_upload,
)
from fixtures.pageserver.types import parse_layer_file_name
from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
@@ -247,10 +246,7 @@ def test_tenant_redownloads_truncated_file_on_startup(

    # ensure the same size is found from the index_part.json
    index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id)
    assert (
        index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"]
        == expected_size
    )
    assert index_part["layer_metadata"][path.name]["file_size"] == expected_size

    ## Start the pageserver. It will notice that the file size doesn't match, and
    ## rename away the local file. It will be re-downloaded when it's needed.
@@ -280,7 +276,7 @@ def test_tenant_redownloads_truncated_file_on_startup(

    # the remote side of local_layer_truncated
    remote_layer_path = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, parse_layer_file_name(path.name).to_str()
        tenant_id, timeline_id, path.name
    )

    # if the upload ever was ongoing, this check would be racy, but at least one

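The change in these hunks is only about which name is used as the lookup key: the raw on-disk file name versus the form produced by `parse_layer_file_name(...).to_str()`, which is assumed here to be the canonical layer name used in `index_part.json` and for remote paths. A small sketch of the lookup difference (the helper itself is hypothetical):

from fixtures.pageserver.types import parse_layer_file_name


def index_file_size(index_part: dict, file_name: str, normalize: bool) -> int:
    # normalize=True mirrors the removed assertion, normalize=False mirrors the new one.
    key = parse_layer_file_name(file_name).to_str() if normalize else file_name
    return index_part["layer_metadata"][key]["file_size"]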
@@ -1,403 +0,0 @@
import enum
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue
from threading import Barrier
from typing import List

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import HistoricLayerInfo
from fixtures.pageserver.utils import wait_timeline_detail_404
from fixtures.types import Lsn, TimelineId


def by_end_lsn(info: HistoricLayerInfo) -> Lsn:
    assert info.lsn_end is not None
    return Lsn(info.lsn_end)


def layer_name(info: HistoricLayerInfo) -> str:
    return info.layer_file_name


@enum.unique
class Branchpoint(str, enum.Enum):
    """
    Have branches at these Lsns possibly relative to L0 layer boundary.
    """

    EARLIER = "earlier"
    AT_L0 = "at"
    AFTER_L0 = "after"
    LAST_RECORD_LSN = "head"

    def __str__(self) -> str:
        return self.value

    @staticmethod
    def all() -> List["Branchpoint"]:
        return [
            Branchpoint.EARLIER,
            Branchpoint.AT_L0,
            Branchpoint.AFTER_L0,
            Branchpoint.LAST_RECORD_LSN,
        ]


SHUTDOWN_ALLOWED_ERRORS = [
    ".*initial size calculation failed: downloading failed, possibly for shutdown",
    ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
]


@pytest.mark.parametrize("branchpoint", Branchpoint.all())
@pytest.mark.parametrize("restart_after", [True, False])
def test_ancestor_detach_branched_from(
    neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool
):
    """
    Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached.
    """
    # TODO: parametrize; currently unimplemented over at pageserver
    write_to_branch_first = True

    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)

    client = env.pageserver.http_client()

    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
        ep.safe_psql("CREATE TABLE foo (i BIGINT);")

        after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")

        # create a single layer for us to remote copy
        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
        client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)

        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

    deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers()
    # there is also the in-mem layer, but ignore it for now
    assert len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed"
    later_delta = max(deltas, key=by_end_lsn)
    assert later_delta.lsn_end is not None

    # -1 as the lsn_end is exclusive.
    last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1

    if branchpoint == Branchpoint.EARLIER:
        branch_at = after_first_tx
        rows = 0
        truncated_layers = 1
    elif branchpoint == Branchpoint.AT_L0:
        branch_at = Lsn(last_lsn)
        rows = 8192
        truncated_layers = 0
    elif branchpoint == Branchpoint.AFTER_L0:
        branch_at = Lsn(last_lsn + 8)
        rows = 8192
        # as there is no 8 byte walrecord, nothing should get copied from the straddling layer
        truncated_layers = 0
    else:
        # this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet
        assert branchpoint == Branchpoint.LAST_RECORD_LSN
        branch_at = None
        rows = 16384
        truncated_layers = 0

    name = "new main"

    timeline_id = env.neon_cli.create_branch(
        name, "main", env.initial_tenant, ancestor_start_lsn=branch_at
    )

    recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"])
    if branch_at is None:
        # fix it up if we need it later (currently unused)
        branch_at = recorded
    else:
        assert branch_at == recorded, "the test should not use unaligned lsns"

    if write_to_branch_first:
        with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
            # make sure the ep is writable
            # with BEFORE_L0, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers
            ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;")
            wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)

        # branch must have a flush for "PREV_LSN: none"
        client.timeline_checkpoint(env.initial_tenant, timeline_id)
        branch_layers = set(
            map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers)
        )
    else:
        branch_layers = set()

    all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
    assert all_reparented == set()

    if restart_after:
        env.pageserver.stop()
        env.pageserver.start()

    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
        assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 16384

    with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
        assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows

    old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
    old_main = set(map(layer_name, old_main_info.historic_layers))

    new_main_info = client.layer_map_info(env.initial_tenant, timeline_id)
    new_main = set(map(layer_name, new_main_info.historic_layers))

    new_main_copied_or_truncated = new_main - branch_layers
    new_main_truncated = new_main_copied_or_truncated - old_main

    assert len(new_main_truncated) == truncated_layers
    # could additionally check that the symmetric difference has layers starting at the same lsn
    # but if nothing was copied, then there is no nice rule.
    # there could be a hole in LSNs between copied from the "old main" and the first branch layer.

    client.timeline_delete(env.initial_tenant, env.initial_timeline)
    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)


@pytest.mark.parametrize("restart_after", [True, False])
def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool):
    """
    The case from RFC:

                              +-> another branch with same ancestor_lsn as new main
                              |
    old main -------|---------X--------->
                    |         |         |
                    |         |         +-> after
                    |         |
                    |         +-> new main
                    |
                    +-> reparented

    Ends up as:

    old main --------------------------->
                                        |
                                        +-> after

                              +-> another branch with same ancestor_lsn as new main
                              |
    new main -------|---------|->
                    |
                    +-> reparented

    We confirm the end result by being able to delete "old main" after deleting "after".
    """

    # TODO: support not yet implemented for these
    write_to_branch_first = True

    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)

    client = env.pageserver.http_client()

    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
        ep.safe_psql("CREATE TABLE foo (i BIGINT);")
        ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;")

        branchpoint_pipe = wait_for_last_flush_lsn(
            env, ep, env.initial_tenant, env.initial_timeline
        )

        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")

        branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
        client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)

        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

    # as this only gets reparented, we don't need to write to it like new main
    reparented = env.neon_cli.create_branch(
        "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe
    )

    same_branchpoint = env.neon_cli.create_branch(
        "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x
    )

    timeline_id = env.neon_cli.create_branch(
        "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x
    )

    after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None)

    if write_to_branch_first:
        with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192
            with ep.cursor() as cur:
                cur.execute("UPDATE audit SET starts = starts + 1")
                assert cur.rowcount == 1
            wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)

        client.timeline_checkpoint(env.initial_tenant, timeline_id)

    all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
    assert all_reparented == {reparented, same_branchpoint}

    if restart_after:
        env.pageserver.stop()
        env.pageserver.start()

    env.pageserver.quiesce_tenants()

    # checking the ancestor after is much faster than waiting for the endpoint not start
    expected_result = [
        ("main", env.initial_timeline, None, 16384, 1),
        ("after", after, env.initial_timeline, 16384, 1),
        ("new main", timeline_id, None, 8192, 2),
        ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1),
        ("reparented", reparented, timeline_id, 0, 1),
    ]

    for _, timeline_id, expected_ancestor, _, _ in expected_result:
        details = client.timeline_detail(env.initial_tenant, timeline_id)
        ancestor_timeline_id = details["ancestor_timeline_id"]
        if expected_ancestor is None:
            assert ancestor_timeline_id is None
        else:
            assert TimelineId(ancestor_timeline_id) == expected_ancestor

    for name, _, _, rows, starts in expected_result:
        with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
            assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1

    # delete the timelines to confirm detach actually worked
    client.timeline_delete(env.initial_tenant, after)
    wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0)

    client.timeline_delete(env.initial_tenant, env.initial_timeline)
    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)


@pytest.mark.parametrize("restart_after", [True, False])
def test_detached_receives_flushes_while_being_detached(
    neon_env_builder: NeonEnvBuilder, restart_after: bool
):
    """
    Makes sure that the timeline is able to receive writes through-out the detach process.
    """
    write_to_branch_first = True

    env = neon_env_builder.init_start()

    client = env.pageserver.http_client()

    # row counts have been manually verified to cause reconnections and getpage
    # requests when restart_after=False with pg16
    def insert_rows(n: int, ep) -> int:
        ep.safe_psql(
            f"INSERT INTO foo SELECT i::bigint, 'more info!! this is a long string' || i FROM generate_series(0, {n - 1}) g(i);"
        )
        return n

    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
        ep.safe_psql("CREATE EXTENSION neon_test_utils;")
        ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);")

        rows = insert_rows(256, ep)

        branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

    timeline_id = env.neon_cli.create_branch(
        "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint
    )

    log.info("starting the new main endpoint")
    ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant)
    assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows

    if write_to_branch_first:
        rows += insert_rows(256, ep)
        wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)
        client.timeline_checkpoint(env.initial_tenant, timeline_id)
        log.info("completed {write_to_branch_first=}")

    def small_txs(ep, queue: Queue[str], barrier):
        extra_rows = 0

        with ep.connect() as conn:
            while True:
                try:
                    queue.get_nowait()
                    break
                except Empty:
                    pass

                if barrier is not None:
                    barrier.wait()
                    barrier = None

                cursor = conn.cursor()
                cursor.execute(
                    "INSERT INTO foo(i, aux) VALUES (1, 'more info!! this is a long string' || 1);"
                )
                extra_rows += 1
        return extra_rows

    with ThreadPoolExecutor(max_workers=1) as exec:
        queue: Queue[str] = Queue()
        barrier = Barrier(2)

        completion = exec.submit(small_txs, ep, queue, barrier)
        barrier.wait()

        reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
        assert len(reparented) == 0

        if restart_after:
            # ep and row production is kept alive on purpose
            env.pageserver.stop()
            env.pageserver.start()

        env.pageserver.quiesce_tenants()

        queue.put("done")
        extra_rows = completion.result()
        assert extra_rows > 0, "some rows should had been written"
        rows += extra_rows

    assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None

    assert ep.safe_psql("SELECT clear_buffer_cache();")
    assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
    assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
    ep.stop()

    # finally restart the endpoint and make sure we still have the same answer
    with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
        assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows

    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)


# TODO:
# - after starting the operation, tenant is deleted
# - after starting the operation, pageserver is shutdown, restarted
# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited
# - deletion of reparented while reparenting should fail once, then succeed (?)
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
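Taken together, the deleted tests above exercise one flow: call `detach_ancestor`, optionally restart the pageserver, and then verify the timeline no longer reports an ancestor. A condensed sketch of that flow using only the fixture calls that appear in the deleted code (the wrapper function itself is hypothetical):

def detach_and_verify(client, tenant_id, timeline_id):
    # Returns the set of timelines that were reparented onto the detached branch.
    reparented = client.detach_ancestor(tenant_id, timeline_id)

    # Regardless of restarts, the detached timeline must no longer have an ancestor.
    detail = client.timeline_detail(tenant_id, timeline_id)
    assert detail["ancestor_timeline_id"] is None
    return reparented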
Some files were not shown because too many files have changed in this diff