Compare commits

..

1 Commits

Author SHA1 Message Date
John Spray
ed2be78861 pageserver: drop layers after shard split in GC 2024-04-30 11:29:14 +01:00
45 changed files with 513 additions and 2069 deletions

23
Cargo.lock generated
View File

@@ -1319,7 +1319,6 @@ dependencies = [
"git-version",
"hex",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"nix 0.27.1",
"once_cell",
@@ -3185,16 +3184,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3531,12 +3520,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -5112,11 +5095,8 @@ dependencies = [
"hex",
"histogram",
"itertools",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest",
@@ -5125,10 +5105,8 @@ dependencies = [
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
@@ -6529,7 +6507,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",

View File

@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"

View File

@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
ifndef DISABLE_HOMEBREW
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# Use -C option so that when PostgreSQL "make install" installs the

View File

@@ -51,7 +51,6 @@ use tracing::{error, info};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -69,29 +68,6 @@ use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) =
{
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_result = process_cli(&clap_args)?;
let wait_spec_result = wait_spec(build_tag, cli_result)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing context
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
cleanup_and_exit(start_pg_result, wait_pg_result)
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -106,11 +82,35 @@ fn init() -> Result<(String, clap::ArgMatches)> {
.to_string();
info!("build_tag: {build_tag}");
Ok((build_tag, cli().get_matches()))
}
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
{
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -147,7 +147,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
@@ -157,42 +157,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
Some(guard)
} else {
None
}
}
};
fn process_cli(
matches: &clap::ArgMatches,
) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -233,45 +199,6 @@ fn process_cli(
}
};
let result = ProcessCliResult {
// directly from CLI:
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
// others:
spec,
live_config_allowed,
};
Ok(result)
}
struct ProcessCliResult<'clap> {
connstr: &'clap str,
pgdata: &'clap str,
pgbin: &'clap str,
ext_remote_storage: Option<&'clap str>,
http_port: u16,
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec,
live_config_allowed,
}: ProcessCliResult,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
@@ -310,6 +237,8 @@ fn wait_spec(
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -326,19 +255,6 @@ fn wait_spec(
}
}
Ok(WaitSpecResult { compute, http_port })
}
struct WaitSpecResult {
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
}
fn start_postgres(
matches: &clap::ArgMatches,
WaitSpecResult { compute, http_port }: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
@@ -365,10 +281,9 @@ fn start_postgres(
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let extension_server_port: u16 = http_port;
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
@@ -419,7 +334,7 @@ fn start_postgres(
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = rt.as_ref().map(|rt| {
let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
@@ -432,43 +347,12 @@ fn start_postgres(
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(
pg: Option<PostgresHandle>,
) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -483,26 +367,6 @@ fn wait_postgres(
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_and_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
WaitPostgresResult { exit_code }: WaitPostgresResult,
) -> Result<()> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.

View File

@@ -17,7 +17,6 @@ nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }

View File

@@ -1554,8 +1554,8 @@ fn cli() -> Command {
Command::new("storage_controller")
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller"))
.subcommand(Command::new("stop").about("Stop storage controller")
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
)
.subcommand(

View File

@@ -17,7 +17,6 @@ use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use utils::{
auth::{encode_from_key_file, Claims},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -67,10 +66,6 @@ pub struct LocalEnv {
pub broker: NeonBroker,
// Configuration for the storage controller (1 per neon_local environment)
#[serde(default)]
pub storage_controller: NeonStorageControllerConf,
/// This Vec must always contain at least one pageserver
pub pageservers: Vec<PageServerConf>,
@@ -103,29 +98,6 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr,
}
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
/// Heartbeat timeout before marking a node offline
#[serde(with = "humantime_serde")]
pub max_unavailable: Duration,
}
impl NeonStorageControllerConf {
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(10);
}
impl Default for NeonStorageControllerConf {
fn default() -> Self {
Self {
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
}
}
}
// Dummy Default impl to satisfy Deserialize derive.
impl Default for NeonBroker {
fn default() -> Self {

View File

@@ -1,7 +1,4 @@
use crate::{
background_process,
local_env::{LocalEnv, NeonStorageControllerConf},
};
use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
@@ -35,13 +32,15 @@ pub struct StorageController {
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
config: NeonStorageControllerConf,
}
const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
config: env.storage_controller.clone(),
}
}
@@ -274,6 +272,8 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
let mut args = vec![
"-l",
&self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
"--database-url",
&database_url,
"--max-unavailable-interval",
&humantime::Duration::from(self.config.max_unavailable).to_string(),
&max_unavailable.to_string(),
]
.into_iter()
.map(|s| s.to_string())

View File

@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.

View File

@@ -17,10 +17,6 @@ pub struct KeySpace {
pub ranges: Vec<Range<Key>>,
}
/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);
/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
@@ -439,33 +435,10 @@ pub struct KeyPartitioning {
pub parts: Vec<KeySpace>,
}
/// Represents a partitioning of the sparse key space.
#[derive(Clone, Debug, Default)]
pub struct SparseKeyPartitioning {
pub parts: Vec<SparseKeySpace>,
}
impl KeyPartitioning {
pub fn new() -> Self {
KeyPartitioning { parts: Vec::new() }
}
/// Convert a key partitioning to a sparse partition.
pub fn into_sparse(self) -> SparseKeyPartitioning {
SparseKeyPartitioning {
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
}
}
}
impl SparseKeyPartitioning {
/// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
/// cause long/dead loops.
pub fn into_dense(self) -> KeyPartitioning {
KeyPartitioning {
parts: self.parts.into_iter().map(|x| x.0).collect(),
}
}
}
///

View File

@@ -1,11 +1,9 @@
use utils::lsn::Lsn;
use crate::keyspace::SparseKeySpace;
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,
pub sparse_keys: crate::keyspace::SparseKeySpace,
pub at_lsn: Lsn,
}
@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("sparse_keys")?;
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
sparse_keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
sparse_keys: SparseKeySpace(de.sparse_keys.0),
})
}
}
@@ -139,12 +133,6 @@ mod tests {
"030000000000000000000000000000000003"
]
],
"sparse_keys": [
[
"620000000000000000000000000000000000",
"620000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;

View File

@@ -2,10 +2,11 @@
use std::cmp::{Eq, Ordering};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
use std::sync::Mutex;
use std::time::Duration;
use tokio::sync::watch::{self, channel};
use tokio::sync::watch::{channel, Receiver, Sender};
use tokio::time::timeout;
/// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
fn cnt_value(&self) -> V;
}
/// Heap of waiters, lowest numbers pop first.
struct Waiters<V>
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
S: MonotonicCounter<V>,
V: Ord,
{
heap: BinaryHeap<Waiter<V>>,
/// Number of the first waiter in the heap, or None if there are no waiters.
status_channel: watch::Sender<Option<V>>,
}
impl<V> Waiters<V>
where
V: Ord + Copy,
{
fn new() -> Self {
Waiters {
heap: BinaryHeap::new(),
status_channel: channel(None).0,
}
}
/// `status_channel` contains the number of the first waiter in the heap.
/// This function should be called whenever waiters heap changes.
fn update_status(&self) {
let first_waiter = self.heap.peek().map(|w| w.wake_num);
let _ = self.status_channel.send_replace(first_waiter);
}
/// Add new waiter to the heap, return a channel that will be notified when the number arrives.
fn add(&mut self, num: V) -> watch::Receiver<()> {
let (tx, rx) = channel(());
self.heap.push(Waiter {
wake_num: num,
wake_channel: tx,
});
self.update_status();
rx
}
/// Pop all waiters <= num from the heap. Collect channels in a vector,
/// so that caller can wake them up.
fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
let mut wake_these = Vec::new();
while let Some(n) = self.heap.peek() {
if n.wake_num > num {
break;
}
wake_these.push(self.heap.pop().unwrap().wake_channel);
}
self.update_status();
wake_these
}
/// Used on shutdown to efficiently drop all waiters.
fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
let heap = mem::take(&mut self.heap);
self.update_status();
heap
}
waiters: BinaryHeap<Waiter<V>>,
current: S,
shutdown: bool,
}
struct Waiter<T>
where
T: Ord,
{
wake_num: T, // wake me when this number arrives ...
wake_channel: watch::Sender<()>, // ... by sending a message to this channel
wake_num: T, // wake me when this number arrives ...
wake_channel: Sender<()>, // ... by sending a message to this channel
}
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {
impl<T: Ord> Eq for Waiter<T> {}
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
S: MonotonicCounter<V>,
V: Ord,
{
waiters: Waiters<V>,
current: S,
shutdown: bool,
}
/// A tool for waiting on a sequence number
///
/// This provides a way to wait the arrival of a number.
@@ -168,7 +108,7 @@ where
/// Create a new `SeqWait`, initialized to a particular number
pub fn new(starting_num: S) -> Self {
let internal = SeqWaitInt {
waiters: Waiters::new(),
waiters: BinaryHeap::new(),
current: starting_num,
shutdown: false,
};
@@ -188,8 +128,9 @@ where
// Block any future waiters from starting
internal.shutdown = true;
// Take all waiters to drop them later.
internal.waiters.take_all()
// This will steal the entire waiters map.
// When we drop it all waiters will be woken.
mem::take(&mut internal.waiters)
// Drop the lock as we exit this scope.
};
@@ -255,7 +196,7 @@ where
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
let mut internal = self.internal.lock().unwrap();
if internal.current.cnt_value() >= num {
return Ok(None);
@@ -264,8 +205,12 @@ where
return Err(SeqWaitError::Shutdown);
}
// Add waiter channel to the queue.
let rx = internal.waiters.add(num);
// Create a new channel.
let (tx, rx) = channel(());
internal.waiters.push(Waiter {
wake_num: num,
wake_channel: tx,
});
// Drop the lock as we exit this scope.
Ok(Some(rx))
}
@@ -286,8 +231,16 @@ where
}
internal.current.cnt_advance(num);
// Pop all waiters <= num from the heap.
internal.waiters.pop_leq(num)
// Pop all waiters <= num from the heap. Collect them in a vector, and
// wake them up after releasing the lock.
let mut wake_these = Vec::new();
while let Some(n) = internal.waiters.peek() {
if n.wake_num > num {
break;
}
wake_these.push(internal.waiters.pop().unwrap().wake_channel);
}
wake_these
};
for tx in wake_these {
@@ -302,23 +255,6 @@ where
pub fn load(&self) -> S {
self.internal.lock().unwrap().current
}
/// Get a Receiver for the current status.
///
/// The current status is the number of the first waiter in the queue,
/// or None if there are no waiters.
///
/// This receiver will be notified whenever the status changes.
/// It is useful for receiving notifications when the first waiter
/// starts waiting for a number, or when there are no more waiters left.
pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
self.internal
.lock()
.unwrap()
.waiters
.status_channel
.subscribe()
}
}
#[cfg(test)]

View File

@@ -10,7 +10,7 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, Context};
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, Key};
@@ -38,14 +38,6 @@ use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
#[error("basebackup pageserver error {0:#}")]
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#}")]
Client(#[source] io::Error),
}
/// Create basebackup with non-rel data in it.
/// Only include relational data if 'full_backup' is true.
///
@@ -61,7 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>(
prev_lsn: Option<Lsn>,
full_backup: bool,
ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
) -> anyhow::Result<()>
where
W: AsyncWrite + Send + Sync + Unpin,
{
@@ -100,10 +92,8 @@ where
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
return Err(BasebackupError::Server(anyhow!(
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
)));
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn);
}
provided_prev_lsn
} else {
@@ -169,26 +159,15 @@ where
}
}
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
let (kind, segno, _) = key_to_slru_block(*key)?;
match kind {
SlruKind::Clog => {
if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
return Err(BasebackupError::Server(anyhow!(
"invalid SlruKind::Clog record: block.len()={}",
block.len()
)));
}
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
}
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
if block.len() != BLCKSZ as usize {
return Err(BasebackupError::Server(anyhow!(
"invalid {:?} record: block.len()={}",
kind,
block.len()
)));
}
ensure!(block.len() == BLCKSZ as usize);
}
}
@@ -215,15 +194,12 @@ where
Ok(())
}
async fn flush(&mut self) -> Result<(), BasebackupError> {
async fn flush(&mut self) -> anyhow::Result<()> {
let nblocks = self.buf.len() / BLCKSZ as usize;
let (kind, segno) = self.current_segment.take().unwrap();
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
let header = new_tar_header(&segname, self.buf.len() as u64)?;
self.ar
.append(&header, self.buf.as_slice())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, self.buf.as_slice()).await?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -233,7 +209,7 @@ where
Ok(())
}
async fn finish(mut self) -> Result<(), BasebackupError> {
async fn finish(mut self) -> anyhow::Result<()> {
let res = if self.current_segment.is_none() || self.buf.is_empty() {
Ok(())
} else {
@@ -250,7 +226,7 @@ impl<'a, W> Basebackup<'a, W>
where
W: AsyncWrite + Send + Sync + Unpin,
{
async fn send_tarball(mut self) -> Result<(), BasebackupError> {
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -286,8 +262,7 @@ where
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.partition(
self.timeline.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -296,15 +271,10 @@ where
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
for (key, block) in blocks {
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
slru_builder.add_block(&key, block).await?;
slru_builder.add_block(&key, block?).await?;
}
}
slru_builder.finish().await?;
@@ -312,11 +282,8 @@ where
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self
.timeline
.list_dbdirs(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -325,8 +292,7 @@ where
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
@@ -349,12 +315,7 @@ where
}
}
for (path, content) in self
.timeline
.list_aux_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
{
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
if path.starts_with("pg_replslot") {
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
let restart_lsn = Lsn(u64::from_le_bytes(
@@ -385,41 +346,34 @@ where
for xid in self
.timeline
.list_twophase_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
{
self.add_twophase_file(xid).await?;
}
fail_point!("basebackup-before-control-file", |_| {
Err(BasebackupError::Server(anyhow!(
"failpoint basebackup-before-control-file"
)))
bail!("failpoint basebackup-before-control-file")
});
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
self.ar.finish().await?;
debug!("all tarred up!");
Ok(())
}
/// Add contents of relfilenode `src`, naming it as `dst`.
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar
.append(&header, &mut io::empty())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
}
@@ -434,17 +388,13 @@ where
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
segment_data.extend_from_slice(&img[..]);
}
let file_name = dst.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar
.append(&header, segment_data.as_slice())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, segment_data.as_slice()).await?;
seg += 1;
startblk = endblk;
@@ -464,22 +414,20 @@ where
spcnode: u32,
dbnode: u32,
has_relmap_file: bool,
) -> Result<(), BasebackupError> {
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
if img.len()
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
{
return Err(BasebackupError::Server(anyhow!(
"img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
img.len(),
)));
}
ensure!(
img.len()
== dispatch_pgversion!(
self.timeline.pg_version,
pgv::bindings::SIZEOF_RELMAPFILE
)
);
Some(img)
} else {
@@ -492,20 +440,14 @@ where
ver => format!("{ver}\x0A"),
};
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
info!("timeline.pg_version {}", self.timeline.pg_version);
if let Some(img) = relmap_img {
// filenode map for global tablespace
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &img[..]).await?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -524,26 +466,18 @@ where
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.is_empty()
{
return Ok(());
}
// User defined tablespaces are not supported
if spcnode != DEFAULTTABLESPACE_OID {
return Err(BasebackupError::Server(anyhow!(
"spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
)));
}
ensure!(spcnode == DEFAULTTABLESPACE_OID);
// Append dir path for each database
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, &mut io::empty())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &mut io::empty()).await?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -553,17 +487,11 @@ where
ver => format!("{ver}\x0A"),
};
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &img[..]).await?;
}
};
Ok(())
@@ -572,12 +500,11 @@ where
//
// Extract twophase state files
//
async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -585,10 +512,7 @@ where
buf.put_u32_le(crc);
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar
.append(&header, &buf[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &buf[..]).await?;
Ok(())
}
@@ -597,28 +521,24 @@ where
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
//
async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: invalid")?;
}
} else {
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar
.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)
.await
.map_err(BasebackupError::Client)?;
.await?;
let checkpoint_bytes = self
.timeline
@@ -640,10 +560,7 @@ where
//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
self.ar
.append(&header, &pg_control_bytes[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &pg_control_bytes[..]).await?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -658,16 +575,8 @@ where
self.lsn,
)
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
if wal_seg.len() != WAL_SEGMENT_SIZE {
return Err(BasebackupError::Server(anyhow!(
"wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
wal_seg.len()
)));
}
self.ar
.append(&header, &wal_seg[..])
.await
.map_err(BasebackupError::Client)?;
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..]).await?;
Ok(())
}
}

View File

@@ -1918,14 +1918,12 @@ async fn timeline_collect_keyspace(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let (dense_ks, sparse_ks) = timeline
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
// This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
// Therefore, we split dense/sparse keys in this API.
let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
json_response(StatusCode::OK, res)
}

View File

@@ -48,7 +48,6 @@ use utils::{
use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
@@ -1237,13 +1236,6 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
fn map_basebackup_error(err: BasebackupError) -> QueryError {
match err {
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
}
}
let started = std::time::Instant::now();
// check that the timeline exists
@@ -1269,8 +1261,7 @@ impl PageServerHandler {
let lsn_awaited_after = started.elapsed();
// switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)
.map_err(QueryError::Disconnected)?;
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
// Send a tarball of the latest layer on the timeline. Compress if not
@@ -1285,8 +1276,7 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
} else {
let mut writer = pgb.copyout_writer();
if gzip {
@@ -1307,13 +1297,9 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
// shutdown the encoder to ensure the gzip footer is written
encoder
.shutdown()
.await
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
encoder.shutdown().await?;
} else {
basebackup::send_basebackup_tarball(
&mut writer,
@@ -1323,13 +1309,11 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
}
}
pgb.write_message_noflush(&BeMessage::CopyDone)
.map_err(QueryError::Disconnected)?;
pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let basebackup_after = started

View File

@@ -23,7 +23,6 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -731,13 +730,11 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_keyspace(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
) -> Result<KeySpace, CollectKeySpaceError> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -809,12 +806,7 @@ impl Timeline {
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
result.add_key(AUX_FILES_KEY);
}
Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
))
Ok(result.to_keyspace())
}
/// Get cached size of relation if it not updated after specified LSN

View File

@@ -3873,7 +3873,6 @@ mod tests {
use hex_literal::hex;
use pageserver_api::key::NON_INHERITED_RANGE;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::CompactionAlgorithm;
use rand::{thread_rng, Rng};
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4513,23 +4512,11 @@ mod tests {
}
async fn bulk_insert_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
let compact = true;
bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
}
async fn bulk_insert_maybe_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<()> {
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
@@ -4570,11 +4557,9 @@ mod tests {
)
.await?;
timeline.freeze_and_flush().await?;
if compact {
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
}
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
timeline.gc().await?;
}
@@ -5057,22 +5042,7 @@ mod tests {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let names_algorithms = [
("test_random_updates_legacy", CompactionAlgorithm::Legacy),
("test_random_updates_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_random_updates_algorithm(name, algorithm).await?;
}
Ok(())
}
async fn test_random_updates_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let harness = TenantHarness::create("test_random_updates")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5137,7 +5107,7 @@ mod tests {
);
}
// Perform a cycle of flush, and GC
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
@@ -5149,6 +5119,9 @@ mod tests {
)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.gc().await?;
}
@@ -5429,36 +5402,19 @@ mod tests {
#[tokio::test]
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
let names_algorithms = [
("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_read_at_max_lsn_algorithm(name, algorithm).await?;
}
Ok(())
}
async fn test_read_at_max_lsn_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let harness = TenantHarness::create("test_read_at_max_lsn")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let lsn = Lsn(0x10);
let compact = false;
bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let read_lsn = Lsn(u64::MAX - 1);
let result = tline.get(test_key, read_lsn, &ctx).await;
assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
Ok(())
}

View File

@@ -916,7 +916,6 @@ mod tests {
assert_eq!(lhs, rhs);
}
#[cfg(test)]
fn brute_force_range_search(
layer_map: &LayerMap,
key_range: Range<Key>,

View File

@@ -2,7 +2,6 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
@@ -254,15 +253,17 @@ impl TenantsMap {
}
}
/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
/// the slower actual deletion in the background.
///
/// This is "safe" in that that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
let parent = path
.as_ref()
@@ -285,28 +286,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
Ok(tmp_path)
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
@@ -591,11 +570,7 @@ pub async fn init_tenant_mgr(
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
// Accumulate futures for writing tenant configs, so that we can execute in parallel
let mut config_write_futs = Vec::new();
// Update the location configs according to the re-attach response and persist them to disk
tracing::info!("Updating {} location configs", tenant_configs.len());
// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -622,22 +597,18 @@ pub async fn init_tenant_mgr(
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
SecondaryLocationConfig { warm: true };
// Update the location config according to the re-attach response
if let Some(tenant_modes) = &tenant_modes {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
match tenant_modes.get(&tenant_shard_id) {
None => {
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
match safe_rename_tenant_dir(&tenant_dir_path).await {
Ok(tmp_path) => {
spawn_background_purge(tmp_path);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
}
};
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
// We deleted local content: move on to next tenant, don't try and spawn this one.
continue;
@@ -683,32 +654,8 @@ pub async fn init_tenant_mgr(
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
config_write_futs.push(async move {
let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
(tenant_shard_id, location_conf, r)
});
}
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
// Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
tracing::info!(
"Writing {} location config files...",
config_write_futs.len()
);
let config_write_results = futures::stream::iter(config_write_futs)
.buffer_unordered(16)
.collect::<Vec<_>>()
.await;
tracing::info!(
"Spawning {} tenant shard locations...",
config_write_results.len()
);
// For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
// Errors writing configs are fatal
config_write_result?;
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => {
@@ -1752,7 +1699,7 @@ impl TenantManager {
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
spawn_background_purge(tmp_path);
self.spawn_background_purge(tmp_path);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
@@ -1907,6 +1854,28 @@ impl TenantManager {
shutdown_all_tenants0(self.tenants).await
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
pub(crate) async fn detach_tenant(
&self,
conf: &'static PageServerConf,
@@ -1923,7 +1892,7 @@ impl TenantManager {
deletion_queue_client,
)
.await?;
spawn_background_purge(tmp_path);
self.spawn_background_purge(tmp_path);
Ok(())
}

View File

@@ -597,17 +597,14 @@ impl InMemoryLayer {
}
}
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
/// layer will only contain the key range the user specifies, and may return `None`
/// if there are no matching keys.
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(
&self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> Result<Option<ResidentLayer>> {
) -> Result<ResidentLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -621,21 +618,6 @@ impl InMemoryLayer {
let end_lsn = *self.end_lsn.get().unwrap();
let keys: Vec<_> = if let Some(key_range) = key_range {
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.map(|(k, m)| (k.to_i128(), m))
.collect()
} else {
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
};
if keys.is_empty() {
return Ok(None);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
@@ -667,6 +649,6 @@ impl InMemoryLayer {
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
Ok(Some(delta_layer))
Ok(delta_layer)
}
}

View File

@@ -401,8 +401,8 @@ impl Layer {
&self.0.path
}
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
pub(crate) fn local_path_str(&self) -> &Arc<str> {
&self.0.path_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
@@ -527,8 +527,8 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
/// String representation of the full path, used for traversal id.
path_str: Arc<str>,
desc: PersistentLayerDesc,
@@ -735,7 +735,7 @@ impl LayerInner {
LayerInner {
conf,
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path_str: path.to_string().into(),
path,
desc,
timeline: Arc::downgrade(timeline),

View File

@@ -17,13 +17,13 @@ use fail::fail_point;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
keyspace::{KeySpaceAccum, ShardedRange},
models::{
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, ShardNumber, TenantShardId},
shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId},
};
use rand::Rng;
use serde_with::serde_as;
@@ -55,6 +55,7 @@ use std::{
ops::ControlFlow,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -65,7 +66,6 @@ use crate::{
disk_usage_eviction_task::DiskUsageEvictionInfo,
pgdatadir_mapping::CollectKeySpaceError,
};
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
@@ -86,7 +86,7 @@ use crate::{
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
@@ -137,25 +137,6 @@ pub(super) enum FlushLoopState {
Exited,
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ImageLayerCreationMode {
/// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path.
Try,
/// Force creating the image layers if possible. For now, no image layers will be created
/// for metadata keys. Used in compaction code path with force flag enabled.
Force,
/// Initial ingestion of the data, and no data should be dropped in this function. This
/// means that no metadata keys should be included in the partitions. Used in flush frozen layer
/// code path.
Initial,
}
impl std::fmt::Display for ImageLayerCreationMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Hole {
@@ -336,7 +317,7 @@ pub struct Timeline {
pub initdb_lsn: Lsn,
/// When did we last calculate the partitioning?
partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
@@ -1253,12 +1234,6 @@ impl Timeline {
self.last_record_lsn.load()
}
/// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
/// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
self.last_record_lsn.status_receiver()
}
pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn.load()
}
@@ -2129,10 +2104,7 @@ impl Timeline {
// initial logical size is 0.
LogicalSize::empty_initial()
},
partitioning: tokio::sync::Mutex::new((
(KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
Lsn(0),
)),
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
@@ -2948,7 +2920,7 @@ trait TraversalLayerExt {
impl TraversalLayerExt for Layer {
fn traversal_id(&self) -> TraversalId {
Arc::clone(self.debug_str())
Arc::clone(self.local_path_str())
}
}
@@ -3134,6 +3106,7 @@ impl Timeline {
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = guard.get_from_desc(&layer);
drop(guard);
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3254,7 +3227,7 @@ impl Timeline {
Ok(())
}
/// Collect the reconstruct data for a keyspace from the specified timeline.
/// Collect the reconstruct data for a ketspace from the specified timeline.
///
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
/// the current keyspace. The current keyspace of the search at any given timeline
@@ -3683,103 +3656,66 @@ impl Timeline {
// files instead. This is possible as long as *all* the data imported into the
// repository have the same LSN.
let lsn_range = frozen_layer.get_lsn_range();
// Whether to directly create image layers for this flush, or flush them as delta layers
let create_image_layer =
lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);
#[cfg(test)]
{
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
..
} => {
if create_image_layer {
let (layers_to_upload, delta_layer_to_add) =
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
initdb_optimization_count,
..
} => {
*initdb_optimization_count += 1;
} else {
}
}
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let (partitioning, _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
)
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
// For image layers, we add them immediately into the layer map.
(
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?,
None,
)
} else {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
..
} => {
assert!(!*expect_initdb_optimization, "expected initdb optimization");
}
}
}
}
let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let ((rel_partition, metadata_partition), _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
// For metadata, always create delta layers.
let delta_layer = if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single aux file keyspace"
);
let metadata_keyspace = &metadata_partition.parts[0];
assert_eq!(
metadata_keyspace.0.ranges.len(),
1,
"aux file keyspace should be a single range"
);
self.create_delta_layer(
&frozen_layer,
ctx,
Some(metadata_keyspace.0.ranges[0].clone()),
)
.await?
} else {
None
};
// For image layers, we add them immediately into the layer map.
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
&rel_partition,
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
if let Some(delta_layer) = delta_layer {
layers_to_upload.push(delta_layer.clone());
(layers_to_upload, Some(delta_layer))
} else {
(layers_to_upload, None)
}
} else {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
};
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
if self.cancel.is_cancelled() {
@@ -3899,18 +3835,12 @@ impl Timeline {
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> anyhow::Result<Option<ResidentLayer>> {
) -> anyhow::Result<ResidentLayer> {
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
let ctx = ctx.attached_child();
let work = async move {
let Some(new_delta) = frozen_layer
.write_to_disk(&self_clone, &ctx, key_range)
.await?
else {
return Ok(None);
};
let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
@@ -3929,7 +3859,7 @@ impl Timeline {
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
anyhow::Ok(Some(new_delta))
anyhow::Ok(new_delta)
};
// Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
// Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
@@ -3956,20 +3886,19 @@ impl Timeline {
partition_size: u64,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
anyhow::bail!("repartition() called concurrently, this should not happen");
};
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
if lsn < *partition_lsn {
if lsn < partitioning_guard.1 {
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
}
let distance = lsn.0 - partition_lsn.0;
if *partition_lsn != Lsn(0)
let distance = lsn.0 - partitioning_guard.1 .0;
if partitioning_guard.1 != Lsn(0)
&& distance <= self.repartition_threshold
&& !flags.contains(CompactFlags::ForceRepartition)
{
@@ -3978,18 +3907,13 @@ impl Timeline {
threshold = self.repartition_threshold,
"no repartitioning needed"
);
return Ok((
(dense_partition.clone(), sparse_partition.clone()),
*partition_lsn,
));
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
}
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
}; // no partitioning for metadata keys for now
*partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
let keyspace = self.collect_keyspace(lsn, ctx).await?;
let partitioning = keyspace.partition(&self.shard_identity, partition_size);
*partitioning_guard = (partitioning, lsn);
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}
@@ -4045,12 +3969,12 @@ impl Timeline {
false
}
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
#[tracing::instrument(skip_all, fields(%lsn, %force))]
async fn create_image_layers(
self: &Arc<Timeline>,
partitioning: &KeyPartitioning,
lsn: Lsn,
mode: ImageLayerCreationMode,
force: bool,
ctx: &RequestContext,
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
@@ -4087,26 +4011,19 @@ impl Timeline {
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;
if partition.overlaps(&Key::metadata_key_range()) {
// TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
// rather big change. Keep this patch small for now.
match mode {
ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
// skip image layer creation anyways for metadata keys.
start = img_range.end;
continue;
}
ImageLayerCreationMode::Initial => {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
}
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
start = img_range.end;
continue;
}
let do_it = if force {
true
} else if check_for_image_layers {
// [`Self::time_for_new_image_layer`] is CPU expensive,
// so skip if we've not collected enough WAL since the last time
self.time_for_new_image_layer(partition, lsn).await
} else {
false
};
if !do_it {
start = img_range.end;
continue;
}
let mut image_layer_writer = ImageLayerWriter::new(
@@ -4601,6 +4518,28 @@ impl Timeline {
'outer: for l in layers.iter_historic_layers() {
result.layers_total += 1;
// 0. Is this layer a relic from a shard split?
// (Do this check first because irrespective of later logic regarding LSNs, this
// layer should be dropped.)
if self.shard_identity.count >= ShardCount::new(2) {
// We are a sharded tenant
let layer = guard.get_from_desc(&l);
if layer.metadata().shard != self.tenant_shard_id.to_index() {
// This is an ancestral layer
let sharded_range = ShardedRange::new(l.get_key_range(), &self.shard_identity);
if sharded_range.page_count() == 0 {
// This ancestral layer only covers keys that belong to other shards
info!(
"garbate collecting layer {} ({:?}) after shard split",
l.filename(),
l.get_key_range()
);
layers_to_remove.push(l);
continue;
}
}
}
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > horizon_cutoff {
debug!(

View File

@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use enumset::EnumSet;
@@ -102,7 +102,7 @@ impl Timeline {
)
.await
{
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
Ok((partitioning, lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,37 +115,17 @@ impl Timeline {
// 3. Create new image layers for partitions that have been modified
// "enough".
let dense_layers = self
let layers = self
.create_image_layers(
&dense_partitioning,
&partitioning,
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
flags.contains(CompactFlags::ForceImageLayerCreation),
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
// For now, nothing will be produced...
let sparse_layers = self
.create_image_layers(
&sparse_partitioning.clone().into_dense(),
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
self.upload_new_image_layers(layers)?;
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -778,9 +758,8 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,

View File

@@ -22,12 +22,10 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::proto::{
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
SubscribeByFilterRequest, TypeSubscription, TypedMessage,
};
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -91,14 +89,6 @@ pub(super) async fn connection_manager_loop_step(
.timeline
.subscribe_for_state_updates();
let mut wait_lsn_status = connection_manager_state
.timeline
.subscribe_for_wait_lsn_updates();
// TODO: create a separate config option for discovery request interval
let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
let mut last_discovery_ts: Option<std::time::Instant> = None;
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
@@ -107,12 +97,10 @@ pub(super) async fn connection_manager_loop_step(
loop {
let time_until_next_retry = connection_manager_state.time_until_next_retry();
let any_activity = connection_manager_state.wal_connection.is_some()
|| !connection_manager_state.wal_stream_candidates.is_empty();
// These things are happening concurrently:
//
// - cancellation request
// - cancellation request
// - keep receiving WAL on the current connection
// - if the shared state says we need to change connection, disconnect and return
// - this runs in a separate task and we receive updates via a watch channel
@@ -120,7 +108,6 @@ pub(super) async fn connection_manager_loop_step(
// - receive updates from broker
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
// - if there's no connection and no candidates, try to send a discovery request
// NB: make sure each of the select expressions are cancellation-safe
// (no need for arms to be cancellation-safe).
@@ -227,65 +214,6 @@ pub(super) async fn connection_manager_loop_step(
}
}
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
Some(()) = async {
// Reminder: this match arm needs to be cancellation-safe.
// Calculating time needed to wait until sending the next discovery request.
// Current implementation is conservative and sends discovery requests only when there are no candidates.
if any_activity {
// No need to send discovery requests if there is an active connection or candidates.
return None;
}
// Waiting for an active wait_lsn request.
while wait_lsn_status.borrow().is_none() {
if wait_lsn_status.changed().await.is_err() {
// wait_lsn_status channel was closed, exiting
warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
return None;
}
}
// All preconditions met, preparing to send a discovery request.
let now = std::time::Instant::now();
let next_discovery_ts = last_discovery_ts
.map(|ts| ts + discovery_request_interval)
.unwrap_or_else(|| now);
if next_discovery_ts > now {
// Prevent sending discovery requests too frequently.
tokio::time::sleep(next_discovery_ts - now).await;
}
let tenant_timeline_id = Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
let msg = TypedMessage {
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: Some(request),
safekeeper_discovery_response: None,
};
last_discovery_ts = Some(std::time::Instant::now());
debug!("No active connection and no candidates, sending discovery request to the broker");
// Cancellation safety: we want to send a message to the broker, but publish_one()
// function can get cancelled by the other select! arm. This is absolutely fine, because
// we just want to receive broker updates and discovery is not important if we already
// receive updates.
//
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
// This is totally fine because of the reason above.
// This is a fire-and-forget request, we don't care about the response
let _ = broker_client.publish_one(msg).await;
debug!("Discovery request sent to the broker");
None
} => {}
}
if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -303,7 +231,7 @@ async fn subscribe_for_timeline_updates(
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
cancel: &CancellationToken,
) -> Result<Streaming<TypedMessage>, Cancelled> {
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
let mut attempt = 0;
loop {
exponential_backoff(
@@ -316,27 +244,17 @@ async fn subscribe_for_timeline_updates(
attempt += 1;
// subscribe to the specific timeline
let request = SubscribeByFilterRequest {
types: vec![
TypeSubscription {
r#type: MessageType::SafekeeperTimelineInfo as i32,
},
TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
},
],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: true,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
}),
}),
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SubscribeSafekeeperInfoRequest {
subscription_key: Some(key),
};
match {
tokio::select! {
r = broker_client.subscribe_by_filter(request) => { r }
r = broker_client.subscribe_safekeeper_info(request) => { r }
_ = cancel.cancelled() => { return Err(Cancelled); }
}
} {
@@ -480,7 +398,7 @@ struct RetryInfo {
/// Data about the timeline to connect to, received from the broker.
#[derive(Debug, Clone)]
struct BrokerSkTimeline {
timeline: SafekeeperDiscoveryResponse,
timeline: SafekeeperTimelineInfo,
/// Time at which the data was fetched from the broker last time, to track the stale data.
latest_update: NaiveDateTime,
}
@@ -688,41 +606,7 @@ impl ConnectionManagerState {
}
/// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
let mut is_discovery = false;
let timeline_update = match typed_msg.r#type() {
MessageType::SafekeeperTimelineInfo => {
let info = match typed_msg.safekeeper_timeline_info {
Some(info) => info,
None => {
warn!("bad proto message from broker: no safekeeper_timeline_info");
return;
}
};
SafekeeperDiscoveryResponse {
safekeeper_id: info.safekeeper_id,
tenant_timeline_id: info.tenant_timeline_id,
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
}
}
MessageType::SafekeeperDiscoveryResponse => {
is_discovery = true;
match typed_msg.safekeeper_discovery_response {
Some(response) => response,
None => {
warn!("bad proto message from broker: no safekeeper_discovery_response");
return;
}
}
}
_ => {
// unexpected message
return;
}
};
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
WALRECEIVER_BROKER_UPDATES.inc();
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -735,11 +619,7 @@ impl ConnectionManagerState {
);
if old_entry.is_none() {
info!(
?is_discovery,
%new_safekeeper_id,
"New SK node was added",
);
info!("New SK node was added: {new_safekeeper_id}");
WALRECEIVER_CANDIDATES_ADDED.inc();
}
}
@@ -938,7 +818,7 @@ impl ConnectionManagerState {
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
self.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.max_by_key(|(_, info, _)| info.commit_lsn)
@@ -948,7 +828,7 @@ impl ConnectionManagerState {
/// Some safekeepers are filtered by the retry cooldown.
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
let now = Utc::now().naive_utc();
self.wal_stream_candidates
@@ -1088,11 +968,19 @@ mod tests {
latest_update: NaiveDateTime,
) -> BrokerSkTimeline {
BrokerSkTimeline {
timeline: SafekeeperDiscoveryResponse {
timeline: SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: None,
term: 0,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
backup_lsn: 0,
remote_consistent_lsn: 0,
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
http_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
},
latest_update,

View File

@@ -22,12 +22,7 @@ serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres_ffi.workspace = true
tokio-stream.workspace = true
tokio-postgres.workspace = true
tokio-util = { workspace = true }
futures-util.workspace = true
itertools.workspace = true
camino.workspace = true

View File

@@ -67,12 +67,10 @@ the purge command will log all the keys that it would have deleted.
#### `scan-metadata`
Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency.
Errors are logged to stderr and summary to stdout.
Walk objects in a pageserver S3 bucket, and report statistics on the contents.
For pageserver:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
Timelines: 31106
With errors: 3
@@ -84,10 +82,6 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
```
For safekeepers, dump_db_connstr and dump_db_table must be
specified; they should point to table with debug dump which will be used
to list timelines and find their backup and start LSNs.
## Cleaning up running pageservers
If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.

View File

@@ -1,13 +1,11 @@
use chrono::{DateTime, Utc};
use futures::Future;
use hex::FromHex;
use std::time::Duration;
use chrono::{DateTime, Utc};
use hex::FromHex;
use reqwest::{header, Client, StatusCode, Url};
use serde::Deserialize;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -139,7 +137,7 @@ pub struct ProjectData {
pub region_id: String,
pub platform_id: String,
pub user_id: String,
pub pageserver_id: Option<u64>,
pub pageserver_id: u64,
#[serde(deserialize_with = "from_nullable_id")]
pub tenant: TenantId,
pub safekeepers: Vec<SafekeeperData>,
@@ -157,7 +155,7 @@ pub struct ProjectData {
pub maintenance_set: Option<String>,
}
#[derive(Debug, Clone, serde::Deserialize)]
#[derive(Debug, serde::Deserialize)]
pub struct BranchData {
pub id: BranchId,
pub created_at: DateTime<Utc>,
@@ -212,39 +210,30 @@ impl CloudAdminApiClient {
.await
.expect("Semaphore is not closed");
let response = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("tenant_id", tenant_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response: AdminApiResponse<Vec<ProjectData>> =
response.json().await.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::BodyRead(e),
)
})?;
Ok(response)
},
"find_tenant_project",
)
.await?;
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("tenant_id", tenant_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
Error::new(
"Find project for tenant".to_string(),
ErrorKind::BodyRead(e),
)
})?;
match response.data.len() {
0 => Ok(None),
1 => Ok(Some(
@@ -272,34 +261,42 @@ impl CloudAdminApiClient {
const PAGINATION_LIMIT: usize = 512;
let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
loop {
let response_bytes = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("show_deleted", "false".to_string()),
("limit", format!("{PAGINATION_LIMIT}")),
("offset", format!("{pagination_offset}")),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"List active projects".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response = self
.http_client
.get(self.append_url("/projects"))
.query(&[
("show_deleted", "false".to_string()),
("limit", format!("{PAGINATION_LIMIT}")),
("offset", format!("{pagination_offset}")),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"List active projects".to_string(),
ErrorKind::RequestSend(e),
)
})?;
response.bytes().await.map_err(|e| {
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
})
},
"list_projects",
)
.await?;
match response.status() {
StatusCode::OK => {}
StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
tokio::time::sleep(Duration::from_millis(500)).await;
continue;
}
_status => {
return Err(Error::new(
"List active projects".to_string(),
ErrorKind::ResponseStatus(response.status()),
))
}
}
let response_bytes = response.bytes().await.map_err(|e| {
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
})?;
let decode_result =
serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -330,7 +327,6 @@ impl CloudAdminApiClient {
pub async fn find_timeline_branch(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Option<BranchData>, Error> {
let _permit = self
@@ -339,61 +335,43 @@ impl CloudAdminApiClient {
.await
.expect("Semaphore is not closed");
let response = CloudAdminApiClient::with_retries(
|| async {
let response = self
.http_client
.get(self.append_url("/branches"))
.query(&[
("timeline_id", timeline_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response = self
.http_client
.get(self.append_url("/branches"))
.query(&[
("timeline_id", timeline_id.to_string()),
("show_deleted", "true".to_string()),
])
.header(header::ACCEPT, "application/json")
.bearer_auth(&self.token)
.send()
.await
.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::RequestSend(e),
)
})?;
let response: AdminApiResponse<Vec<BranchData>> =
response.json().await.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::BodyRead(e),
)
})?;
Ok(response)
},
"find_timeline_branch",
)
.await?;
let mut branches: Vec<BranchData> = response.data.into_iter().collect();
// Normally timeline_id is unique. However, we do have at least one case
// of the same timeline_id in two different projects, apparently after
// manual recovery. So always recheck project_id (discovered through
// tenant_id).
let project_data = match self.find_tenant_project(tenant_id).await? {
Some(pd) => pd,
None => return Ok(None),
};
branches.retain(|b| b.project_id == project_data.id);
if branches.len() < 2 {
Ok(branches.first().cloned())
} else {
Err(Error::new(
format!(
"Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
tenant_id,
timeline_id,
branches.len()
),
let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
Error::new(
"Find branch for timeline".to_string(),
ErrorKind::BodyRead(e),
)
})?;
match response.data.len() {
0 => Ok(None),
1 => Ok(Some(
response
.data
.into_iter()
.next()
.expect("Should have exactly one element"),
)),
too_many => Err(Error::new(
format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
ErrorKind::UnexpectedState,
))
)),
}
}
@@ -554,15 +532,4 @@ impl CloudAdminApiClient {
.parse()
.unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
}
async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
where
O: FnMut() -> F,
F: Future<Output = Result<T, Error>>,
{
let cancel = CancellationToken::new(); // not really used
backoff::retry(op, |_| false, 1, 20, description, &cancel)
.await
.expect("cancellations are disabled")
}
}

View File

@@ -60,7 +60,6 @@ pub struct GarbageList {
/// see garbage, we saw some active tenants too. This protects against classes of bugs
/// in the scrubber that might otherwise generate a "deleted all" result.
active_tenant_count: usize,
active_timeline_count: usize,
}
impl GarbageList {
@@ -68,7 +67,6 @@ impl GarbageList {
Self {
items: Vec::new(),
active_tenant_count: 0,
active_timeline_count: 0,
node_kind,
bucket_config,
}
@@ -121,10 +119,7 @@ pub async fn find_garbage(
const S3_CONCURRENCY: usize = 32;
// How many concurrent API requests to make to the console API.
//
// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It
// would be better to implement real rsp limiter.
const CONSOLE_CONCURRENCY: usize = 16;
const CONSOLE_CONCURRENCY: usize = 128;
struct ConsoleCache {
/// Set of tenants found in the control plane API
@@ -226,7 +221,6 @@ async fn find_garbage_inner(
} else {
tracing::debug!("Tenant {tenant_shard_id} is active");
active_tenants.push(tenant_shard_id);
garbage.active_tenant_count = active_tenants.len();
}
counter += 1;
@@ -267,7 +261,7 @@ async fn find_garbage_inner(
let api_client = cloud_admin_api_client.clone();
async move {
api_client
.find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id)
.find_timeline_branch(ttid.timeline_id)
.await
.map_err(|e| anyhow::anyhow!(e))
.map(|r| (ttid, r))
@@ -277,29 +271,15 @@ async fn find_garbage_inner(
std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));
// Update the GarbageList with any timelines which appear not to exist.
let mut active_timelines: Vec<TenantShardTimelineId> = vec![];
while let Some(result) = timelines_checked.next().await {
let (ttid, console_result) = result?;
if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
tracing::debug!("Timeline {ttid} is garbage");
} else {
tracing::debug!("Timeline {ttid} is active");
active_timelines.push(ttid);
garbage.active_timeline_count = active_timelines.len();
}
}
let num_garbage_timelines = garbage
.items
.iter()
.filter(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
.count();
tracing::info!(
"Found {}/{} garbage timelines in active tenants",
num_garbage_timelines,
active_timelines.len(),
);
Ok(garbage)
}
@@ -364,22 +344,16 @@ pub async fn get_timeline_objects(
const MAX_KEYS_PER_DELETE: usize = 1000;
/// Drain a buffer of keys into DeleteObjects requests
///
/// If `drain` is true, drains keys completely; otherwise stops when <
/// MAX_KEYS_PER_DELETE keys are left.
/// `num_deleted` returns number of deleted keys.
async fn do_delete(
s3_client: &Arc<Client>,
bucket_name: &str,
keys: &mut Vec<ObjectIdentifier>,
dry_run: bool,
drain: bool,
progress_tracker: &mut DeletionProgressTracker,
) -> anyhow::Result<()> {
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
let request_keys =
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
let num_deleted = request_keys.len();
if dry_run {
tracing::info!("Dry-run deletion of objects: ");
for k in request_keys {
@@ -394,30 +368,12 @@ async fn do_delete(
.send()
.await
.context("DeleteObjects request")?;
progress_tracker.register(num_deleted);
}
}
Ok(())
}
/// Simple tracker reporting each 10k deleted keys.
#[derive(Default)]
struct DeletionProgressTracker {
num_deleted: usize,
last_reported_num_deleted: usize,
}
impl DeletionProgressTracker {
fn register(&mut self, n: usize) {
self.num_deleted += n;
if self.num_deleted - self.last_reported_num_deleted > 10000 {
tracing::info!("progress: deleted {} keys", self.num_deleted);
self.last_reported_num_deleted = self.num_deleted;
}
}
}
pub async fn purge_garbage(
input_path: String,
mode: PurgeMode,
@@ -438,14 +394,6 @@ pub async fn purge_garbage(
if garbage_list.active_tenant_count == 0 {
anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
}
if garbage_list
.items
.iter()
.any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
&& garbage_list.active_timeline_count == 0
{
anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
}
let filtered_items = garbage_list
.items
@@ -481,7 +429,6 @@ pub async fn purge_garbage(
std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));
let mut objects_to_delete = Vec::new();
let mut progress_tracker = DeletionProgressTracker::default();
while let Some(result) = get_objects_results.next().await {
let mut object_list = result?;
objects_to_delete.append(&mut object_list);
@@ -492,7 +439,6 @@ pub async fn purge_garbage(
&mut objects_to_delete,
dry_run,
false,
&mut progress_tracker,
)
.await?;
}
@@ -504,11 +450,10 @@ pub async fn purge_garbage(
&mut objects_to_delete,
dry_run,
true,
&mut progress_tracker,
)
.await?;
tracing::info!("{} keys deleted in total", progress_tracker.num_deleted);
tracing::info!("Fell through");
Ok(())
}

View File

@@ -4,8 +4,7 @@ pub mod checks;
pub mod cloud_admin_api;
pub mod garbage;
pub mod metadata_stream;
pub mod scan_pageserver_metadata;
pub mod scan_safekeeper_metadata;
pub mod scan_metadata;
pub mod tenant_snapshot;
use std::env;
@@ -142,17 +141,12 @@ impl RootTarget {
pub fn tenants_root(&self) -> S3Target {
match self {
Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
Self::Safekeeper(root) => root.clone(),
Self::Safekeeper(root) => root.with_sub_segment("wal"),
}
}
pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
match self {
Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
Self::Safekeeper(_) => self
.tenants_root()
.with_sub_segment(&tenant_id.tenant_id.to_string()),
}
self.tenants_root().with_sub_segment(&tenant_id.to_string())
}
pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
@@ -343,7 +337,9 @@ fn init_remote(
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("safekeeper/v1".to_string()),
delimiter,
}),
};
@@ -368,10 +364,7 @@ async fn list_objects_with_retries(
{
Ok(response) => return Ok(response),
Err(e) => {
error!(
"list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
);
error!("list_objects_v2 query failed: {e}");
tokio::time::sleep(Duration::from_secs(1)).await;
}
}

View File

@@ -1,13 +1,9 @@
use anyhow::bail;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
use s3_scrubber::scan_metadata::scan_metadata;
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
use s3_scrubber::{
init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
NodeKind, TraversingDepth,
};
use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
use clap::{Parser, Subcommand};
use utils::id::TenantId;
@@ -39,20 +35,11 @@ enum Command {
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
},
#[command(verbatim_doc_comment)]
ScanMetadata {
#[arg(short, long)]
node_kind: NodeKind,
#[arg(short, long, default_value_t = false)]
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
/// For safekeeper node_kind only, table in the db with debug dump
#[arg(long, default_value = None)]
dump_db_table: Option<String>,
},
TenantSnapshot {
#[arg(long = "tenant-id")]
@@ -85,75 +72,33 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata {
json,
tenant_ids,
node_kind,
dump_db_connstr,
dump_db_table,
} => {
if let NodeKind::Safekeeper = node_kind {
let dump_db_connstr =
dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
let dump_db_table =
dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
let summary = scan_safekeeper_metadata(
bucket_config.clone(),
tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
dump_db_connstr,
dump_db_table,
)
.await?;
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
Command::ScanMetadata { json, tenant_ids } => {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
if summary.is_fatal() {
bail!("Fatal scrub errors detected");
}
if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
bail!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
);
}
Ok(())
} else {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}
}
}

View File

@@ -114,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>(
let timelines_target = target.timelines_root(&tenant);
loop {
tracing::debug!("Listing in {}", tenant);
tracing::info!("Listing in {}", tenant);
let fetch_response =
list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
.await;
@@ -151,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>(
}
}
tracing::debug!("Yielding for {}", tenant);
tracing::info!("Yielding for {}", tenant);
Ok(stream! {
for i in timeline_ids {
let id = i?;

View File

@@ -1,236 +0,0 @@
use std::{collections::HashSet, str::FromStr};
use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use serde::Serialize;
use tokio_postgres::types::PgLsn;
use tracing::{error, info, trace};
use utils::{
id::{TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
};
use crate::{
cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
/// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
#[derive(Serialize)]
pub struct MetadataSummary {
timeline_count: usize,
with_errors: HashSet<TenantTimelineId>,
deleted_count: usize,
}
impl MetadataSummary {
fn new() -> Self {
Self {
timeline_count: 0,
with_errors: HashSet::new(),
deleted_count: 0,
}
}
pub fn summary_string(&self) -> String {
format!(
"timeline_count: {}, with_errors: {}",
self.timeline_count,
self.with_errors.len()
)
}
pub fn is_empty(&self) -> bool {
self.timeline_count == 0
}
pub fn is_fatal(&self) -> bool {
!self.with_errors.is_empty()
}
}
/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
/// statistics.
///
/// It works by listing timelines along with timeline_start_lsn and backup_lsn
/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL
/// segments are missing, before complaining control plane is queried to check if
/// the project wasn't deleted in the meanwhile.
pub async fn scan_safekeeper_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantId>,
dump_db_connstr: String,
dump_db_table: String,
) -> anyhow::Result<MetadataSummary> {
info!(
"checking bucket {}, region {}, dump_db_table {}",
bucket_config.bucket, bucket_config.region, dump_db_table
);
// Use the native TLS implementation (Neon requires TLS)
let tls_connector =
postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
let tenant_filter_clause = if !tenant_ids.is_empty() {
format!(
"and tenant_id in ({})",
tenant_ids
.iter()
.map(|t| format!("'{}'", t))
.collect::<Vec<_>>()
.join(", ")
)
} else {
"".to_owned()
};
let query = format!(
"select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
dump_db_table, tenant_filter_clause,
);
info!("query is {}", query);
let timelines = client.query(&query, &[]).await?;
info!("loaded {} timelines", timelines.len());
let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
let console_config = ConsoleConfig::from_env()?;
let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id");
let timeline_start_lsn_pg: PgLsn = row.get(2);
let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
let backup_lsn_pg: PgLsn = row.get(3);
let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
check_timeline(
&s3_client,
&target,
&cloud_admin_api_client,
ttid,
timeline_start_lsn,
backup_lsn,
)
});
// Run multiple check_timeline's concurrently.
const CONCURRENCY: usize = 32;
let mut timelines = checks.try_buffered(CONCURRENCY);
let mut summary = MetadataSummary::new();
while let Some(r) = timelines.next().await {
let res = r?;
summary.timeline_count += 1;
if !res.is_ok {
summary.with_errors.insert(res.ttid);
}
if res.is_deleted {
summary.deleted_count += 1;
}
}
Ok(summary)
}
struct TimelineCheckResult {
ttid: TenantTimelineId,
is_ok: bool,
is_deleted: bool, // timeline is deleted in cplane
}
/// List s3 and check that is has all expected WAL for the ttid. Consistency
/// errors are logged to stderr; returns Ok(true) if timeline is consistent,
/// Ok(false) if not, Err if failed to check.
async fn check_timeline(
s3_client: &Client,
root: &RootTarget,
api_client: &CloudAdminApiClient,
ttid: TenantTimelineId,
timeline_start_lsn: Lsn,
backup_lsn: Lsn,
) -> anyhow::Result<TimelineCheckResult> {
trace!(
"checking ttid {}, should contain WAL [{}-{}]",
ttid,
timeline_start_lsn,
backup_lsn
);
// calculate expected segfiles
let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
(expected_first_segno..expected_last_segno)
.map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
);
let expected_files_num = expected_segfiles.len();
trace!("expecting {} files", expected_segfiles.len(),);
// now list s3 and check if it misses something
let ttshid =
TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
let mut timeline_dir_target = root.timeline_root(&ttshid);
// stream_listing yields only common_prefixes if delimiter is not empty, but
// we need files, so unset it.
timeline_dir_target.delimiter = String::new();
let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
while let Some(obj) = stream.next().await {
let obj = obj?;
let key = obj.key();
let seg_name = key
.strip_prefix(&timeline_dir_target.prefix_in_bucket)
.expect("failed to extract segment name");
expected_segfiles.remove(seg_name);
}
if !expected_segfiles.is_empty() {
// Before complaining check cplane, probably timeline is already deleted.
let bdata = api_client
.find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
.await?;
let deleted = match bdata {
Some(bdata) => bdata.deleted,
None => {
// note: should be careful with selecting proper cplane address
info!("ttid {} not found, assuming it is deleted", ttid);
true
}
};
if deleted {
// ok, branch is deleted
return Ok(TimelineCheckResult {
ttid,
is_ok: true,
is_deleted: true,
});
}
error!(
"ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
ttid,
expected_segfiles.len(),
expected_files_num,
timeline_start_lsn,
backup_lsn,
);
return Ok(TimelineCheckResult {
ttid,
is_ok: false,
is_deleted: false,
});
}
Ok(TimelineCheckResult {
ttid,
is_ok: true,
is_deleted: false,
})
}

View File

@@ -177,10 +177,6 @@ struct Args {
/// Controls how long backup will wait until uploading the partial segment.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
partial_backup_timeout: Duration,
/// Disable task to push messages to broker every second. Supposed to
/// be used in tests.
#[arg(long)]
disable_periodic_broker_push: bool,
}
// Like PathBufValueParser, but allows empty string.
@@ -313,7 +309,6 @@ async fn main() -> anyhow::Result<()> {
walsenders_keep_horizon: args.walsenders_keep_horizon,
partial_backup_enabled: args.partial_backup_enabled,
partial_backup_timeout: args.partial_backup_timeout,
disable_periodic_broker_push: args.disable_periodic_broker_push,
};
// initialize sentry if SENTRY_DSN is provided

View File

@@ -10,20 +10,11 @@ use anyhow::Result;
use storage_broker::parse_proto_ttid;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
use storage_broker::proto::FilterTenantTimelineId;
use storage_broker::proto::MessageType;
use storage_broker::proto::SafekeeperDiscoveryResponse;
use storage_broker::proto::SubscribeByFilterRequest;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TypeSubscription;
use storage_broker::proto::TypedMessage;
use storage_broker::Request;
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use std::time::UNIX_EPOCH;
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tracing::*;
@@ -40,12 +31,6 @@ const PUSH_INTERVAL_MSEC: u64 = 1000;
/// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
if conf.disable_periodic_broker_push {
info!("broker push_loop is disabled, doing nothing...");
futures::future::pending::<()>().await; // sleep forever
return Ok(());
}
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -90,7 +75,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
}
/// Subscribe and fetch all the interesting data from the broker.
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
// TODO: subscribe only to local timelines instead of all
@@ -109,8 +94,6 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);
while let Some(msg) = stream.message().await? {
stats.update_pulled();
let proto_ttid = msg
.tenant_timeline_id
.as_ref()
@@ -136,93 +119,12 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
bail!("end of stream");
}
/// Process incoming discover requests. This is done in a separate task to avoid
/// interfering with the normal pull/push loops.
async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
let request = SubscribeByFilterRequest {
types: vec![TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
}],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: false,
tenant_timeline_id: None,
}),
};
let mut stream = client
.subscribe_by_filter(request)
.await
.context("subscribe_by_filter request failed")?
.into_inner();
let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]);
while let Some(typed_msg) = stream.message().await? {
stats.update_pulled();
match typed_msg.r#type() {
MessageType::SafekeeperDiscoveryRequest => {
let msg = typed_msg
.safekeeper_discovery_request
.expect("proto type mismatch from broker message");
let proto_ttid = msg
.tenant_timeline_id
.as_ref()
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
let ttid = parse_proto_ttid(proto_ttid)?;
if let Ok(tli) = GlobalTimelines::get(ttid) {
// we received a discovery request for a timeline we know about
discover_counter.inc();
// create and reply with discovery response
let sk_info = tli.get_safekeeper_info(&conf).await;
let response = SafekeeperDiscoveryResponse {
safekeeper_id: sk_info.safekeeper_id,
tenant_timeline_id: sk_info.tenant_timeline_id,
commit_lsn: sk_info.commit_lsn,
safekeeper_connstr: sk_info.safekeeper_connstr,
availability_zone: sk_info.availability_zone,
};
// note this is a blocking call
client
.publish_one(TypedMessage {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: None,
safekeeper_discovery_response: Some(response),
})
.await?;
}
}
_ => {
warn!(
"unexpected message type i32 {}, {:?}",
typed_msg.r#type,
typed_msg.r#type()
);
}
}
}
bail!("end of stream");
}
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
info!("started, broker endpoint {:?}", conf.broker_endpoint);
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
let mut discover_handle: Option<JoinHandle<Result<(), Error>>> = None;
let stats = Arc::new(BrokerStats::new());
let stats_task = task_stats(stats.clone());
tokio::pin!(stats_task);
// Selecting on JoinHandles requires some squats; is there a better way to
// reap tasks individually?
@@ -251,77 +153,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
};
pull_handle = None;
},
res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => {
// was it panic or normal error?
match res {
Ok(res_internal) => if let Err(err_inner) = res_internal {
warn!("discover task failed: {:?}", err_inner);
}
Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) }
};
discover_handle = None;
},
_ = ticker.tick() => {
if push_handle.is_none() {
push_handle = Some(tokio::spawn(push_loop(conf.clone())));
}
if pull_handle.is_none() {
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
pull_handle = Some(tokio::spawn(pull_loop(conf.clone())));
}
if discover_handle.is_none() {
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
}
},
_ = &mut stats_task => {}
}
}
}
struct BrokerStats {
/// Timestamp of the last received message from the broker.
last_pulled_ts: AtomicU64,
}
impl BrokerStats {
fn new() -> Self {
BrokerStats {
last_pulled_ts: AtomicU64::new(0),
}
}
fn now_millis() -> u64 {
std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("time is before epoch")
.as_millis() as u64
}
/// Update last_pulled timestamp to current time.
fn update_pulled(&self) {
self.last_pulled_ts
.store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed);
}
}
/// Periodically write to logs if there are issues with receiving data from the broker.
async fn task_stats(stats: Arc<BrokerStats>) {
let warn_duration = Duration::from_secs(10);
let mut ticker = tokio::time::interval(warn_duration);
loop {
tokio::select! {
_ = ticker.tick() => {
let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst);
if last_pulled == 0 {
// no broker updates yet
continue;
}
let now = BrokerStats::now_millis();
if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
info!("no broker updates for some time, last update: {:?}", ts);
}
}
}
}

View File

@@ -83,7 +83,6 @@ pub struct SafeKeeperConf {
pub walsenders_keep_horizon: bool,
pub partial_backup_enabled: bool,
pub partial_backup_timeout: Duration,
pub disable_periodic_broker_push: bool,
}
impl SafeKeeperConf {
@@ -130,7 +129,6 @@ impl SafeKeeperConf {
walsenders_keep_horizon: false,
partial_backup_enabled: false,
partial_backup_timeout: Duration::from_secs(0),
disable_periodic_broker_push: false,
}
}
}

View File

@@ -178,7 +178,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
walsenders_keep_horizon: false,
partial_backup_enabled: false,
partial_backup_timeout: Duration::from_secs(0),
disable_periodic_broker_push: false,
};
let mut global = GlobalMap::new(disk, conf.clone())?;

View File

@@ -196,13 +196,8 @@ impl SubscriptionKey {
/// Parse from FilterTenantTimelineId
pub fn from_proto_filter_tenant_timeline_id(
opt: Option<&FilterTenantTimelineId>,
f: &FilterTenantTimelineId,
) -> Result<Self, Status> {
if opt.is_none() {
return Ok(SubscriptionKey::All);
}
let f = opt.unwrap();
if !f.enabled {
return Ok(SubscriptionKey::All);
}
@@ -539,7 +534,10 @@ impl BrokerService for Broker {
.remote_addr()
.expect("TCPConnectInfo inserted by handler");
let proto_filter = request.into_inner();
let ttid_filter = proto_filter.tenant_timeline_id.as_ref();
let ttid_filter = proto_filter
.tenant_timeline_id
.as_ref()
.ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;
let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
let types_set = proto_filter

View File

@@ -90,11 +90,7 @@ const INITIAL_GENERATION: Generation = Generation::new(0);
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
/// How long a node may be unresponsive to heartbeats before we declare it offline.
/// This must be long enough to cover node restarts as well as normal operations: in future
/// it should be separated into distinct timeouts for startup vs. normal operation
/// (`<https://github.com/neondatabase/neon/issues/7552>`)
pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
@@ -4255,9 +4251,7 @@ impl Service {
/// Check all tenants for pending reconciliation work, and reconcile those in need.
/// Additionally, reschedule tenants that require it.
///
/// Returns how many reconciliation tasks were started, or `1` if no reconciles were
/// spawned but some _would_ have been spawned if `reconciler_concurrency` units where
/// available. A return value of 0 indicates that everything is fully reconciled already.
/// Returns how many reconciliation tasks were started
fn reconcile_all(&self) -> usize {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
@@ -4272,11 +4266,7 @@ impl Service {
}
// Skip checking if this shard is already enqueued for reconciliation
if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
// If there is something delayed, then return a nonzero count so that
// callers like reconcile_all_now do not incorrectly get the impression
// that the system is in a quiescent state.
reconciles_spawned = std::cmp::max(1, reconciles_spawned);
if shard.delayed_reconcile {
continue;
}
@@ -4461,7 +4451,7 @@ impl Service {
waiter_count
);
Ok(std::cmp::max(waiter_count, reconciles_spawned))
Ok(waiter_count)
}
pub async fn shutdown(&self) {

View File

@@ -952,8 +952,8 @@ impl TenantShard {
/// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
///
/// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
/// you would like to wait on the next reconciler that gets spawned in the background.
/// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but
/// you would like to wait until one gets spawned in the background.
pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
self.ensure_sequence_ahead();

View File

@@ -14,18 +14,10 @@ class ComputeReconfigure:
self.server = server
self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
self.workloads = {}
self.on_notify = None
def register_workload(self, workload):
self.workloads[workload.tenant_id] = workload
def register_on_notify(self, fn):
"""
Add some extra work during a notification, like sleeping to slow things down, or
logging what was notified.
"""
self.on_notify = fn
@pytest.fixture(scope="function")
def compute_reconfigure_listener(make_httpserver):
@@ -51,9 +43,6 @@ def compute_reconfigure_listener(make_httpserver):
body: dict[str, Any] = request.json
log.info(f"notify-attach request: {body}")
if self.on_notify is not None:
self.on_notify(body)
try:
workload = self.workloads[TenantId(body["tenant_id"])]
except KeyError:

View File

@@ -499,7 +499,6 @@ class NeonEnvBuilder:
self.config_init_force: Optional[str] = None
self.top_output_dir = top_output_dir
self.control_plane_compute_hook_api: Optional[str] = None
self.storage_controller_config: Optional[dict[Any, Any]] = None
self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
@@ -1022,7 +1021,6 @@ class NeonEnv:
self.pg_distrib_dir = config.pg_distrib_dir
self.endpoint_counter = 0
self.pageserver_config_override = config.pageserver_config_override
self.storage_controller_config = config.storage_controller_config
# generate initial tenant ID here instead of letting 'neon init' generate it,
# so that we don't need to dig it out of the config file afterwards.
@@ -1068,9 +1066,6 @@ class NeonEnv:
if self.control_plane_compute_hook_api is not None:
cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api
if self.storage_controller_config is not None:
cfg["storage_controller"] = self.storage_controller_config
# Create config for pageserver
http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1139,9 +1134,12 @@ class NeonEnv:
# bounce through retries on startup
self.storage_controller.start()
def storage_controller_ready():
assert self.storage_controller.ready() is True
# Wait for storage controller readiness to prevent unnecessary post start-up
# reconcile.
self.storage_controller.wait_until_ready()
wait_until(30, 1, storage_controller_ready)
# Start up broker, pageserver and all safekeepers
futs = []
@@ -2045,15 +2043,6 @@ class NeonStorageController(MetricsGetter):
else:
raise RuntimeError(f"Unexpected status {status} from readiness endpoint")
def wait_until_ready(self):
t1 = time.time()
def storage_controller_ready():
assert self.ready() is True
wait_until(30, 1, storage_controller_ready)
return time.time() - t1
def attach_hook_issue(
self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
) -> int:
@@ -2141,7 +2130,7 @@ class NeonStorageController(MetricsGetter):
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
tenant_config: Optional[Dict[Any, Any]] = None,
placement_policy: Optional[Union[Dict[Any, Any] | str]] = None,
placement_policy: Optional[str] = None,
):
"""
Use this rather than pageserver_api() when you need to include shard parameters
@@ -2251,21 +2240,10 @@ class NeonStorageController(MetricsGetter):
def reconcile_until_idle(self, timeout_secs=30):
start_at = time.time()
n = 1
delay_sec = 0.5
delay_max = 5
while n > 0:
n = self.reconcile_all()
if n == 0:
break
elif time.time() - start_at > timeout_secs:
if time.time() - start_at > timeout_secs:
raise RuntimeError("Timeout in reconcile_until_idle")
else:
# Don't call again right away: if we're waiting for many reconciles that
# are blocked on the concurrency limit, it slows things down to call
# reconcile_all frequently.
time.sleep(delay_sec)
delay_sec *= 2
delay_sec = min(delay_sec, delay_max)
def consistency_check(self):
"""
@@ -3756,9 +3734,7 @@ class S3Scrubber:
return stdout
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
)
stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
try:
return json.loads(stdout)

View File

@@ -1,198 +0,0 @@
import concurrent.futures
import random
import time
import pytest
from fixtures.compute_reconfigure import ComputeReconfigure
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pg_version import PgVersion
from fixtures.types import TenantId, TenantShardId, TimelineId
@pytest.mark.timeout(3600) # super long running test: should go down as we optimize
def test_storage_controller_many_tenants(
neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure
):
"""
Check that we cope well with a not-totally-trivial number of tenants.
This is checking for:
- Obvious concurrency bugs from issuing many tenant creations/modifications
concurrently.
- Obvious scaling bugs like O(N^2) scaling that would be so slow that even
a basic test starts failing from slowness.
This is _not_ a comprehensive scale test: just a basic sanity check that
we don't fall over for a thousand shards.
"""
neon_env_builder.num_pageservers = 5
neon_env_builder.storage_controller_config = {
# Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
# TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
# guard against regressions in restart time.
"max_unavailable": "300s"
}
neon_env_builder.control_plane_compute_hook_api = (
compute_reconfigure_listener.control_plane_compute_hook_api
)
# A small sleep on each call into the notify hook, to simulate the latency of doing a database write
compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
env = neon_env_builder.init_start()
# We will intentionally stress reconciler concurrrency, which triggers a warning when lots
# of shards are hitting the delayed path.
env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile")
for ps in env.pageservers:
# This can happen because when we do a loop over all pageservers and mark them offline/active,
# reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
# bumping generation before other attachments are detached.
#
# We could clean this up by making reconcilers respect the .observed of their predecessor, if
# we spawn with a wait for the predecessor.
ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
# Storage controller is allowed to drop pageserver requests when the cancellation token
# for a Reconciler fires.
ps.allowed_errors.append(".*request was dropped before completing.*")
# Total tenants
tenant_count = 4000
# Shards per tenant
shard_count = 2
stripe_size = 1024
tenants = set(TenantId.generate() for _i in range(0, tenant_count))
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
def check_memory():
# Shards should be cheap_ in memory, as we will have very many of them
expect_memory_per_shard = 128 * 1024
rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
assert rss is not None
log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
assert rss < expect_memory_per_shard * shard_count * tenant_count
# We use a fixed seed to make the test somewhat reproducible: we want a randomly
# chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
rng = random.Random(1234)
# Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
# permits, to ensure that we are exercising stressing that.
api_concurrency = 135
# We will create tenants directly via API, not via neon_local, to avoid any false
# serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
futs = []
t1 = time.time()
for tenant_id in tenants:
f = executor.submit(
env.storage_controller.tenant_create,
tenant_id,
shard_count,
stripe_size,
placement_policy={"Attached": 1},
)
futs.append(f)
# Wait for creations to finish
for f in futs:
f.result()
log.info(
f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s"
)
run_ops = api_concurrency * 4
assert run_ops < len(tenants)
op_tenants = list(tenants)[0:run_ops]
# Generate a mixture of operations and dispatch them all concurrently
futs = []
for tenant_id in op_tenants:
op = rng.choice([0, 1, 2])
if op == 0:
# A fan-out write operation to all shards in a tenant (timeline creation)
f = executor.submit(
virtual_ps_http.timeline_create,
PgVersion.NOT_SET,
tenant_id,
TimelineId.generate(),
)
elif op == 1:
# A reconciler operation: migrate a shard.
shard_number = rng.randint(0, shard_count - 1)
tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
f = executor.submit(
env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
)
elif op == 2:
# A passthrough read to shard zero
f = executor.submit(virtual_ps_http.tenant_status, tenant_id)
futs.append(f)
# Wait for mixed ops to finish
for f in futs:
f.result()
# Consistency check is safe here: all the previous operations waited for reconcile before completing
env.storage_controller.consistency_check()
check_memory()
# This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time
# how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if
# it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling)
#
# We do not require that the system is quiescent already here, although at present in this point in the test
# that may be the case.
while True:
t1 = time.time()
reconcilers = env.storage_controller.reconcile_all()
if reconcilers == 0:
# Time how long a no-op background reconcile takes: this measures how long it takes to
# loop over all the shards looking for work to do.
runtime = time.time() - t1
log.info(f"No-op call to reconcile_all took {runtime}s")
assert runtime < 1
break
# Restart the storage controller
env.storage_controller.stop()
env.storage_controller.start()
# See how long the controller takes to pass its readiness check. This should be fast because
# all the nodes are online: offline pageservers are the only thing that's allowed to delay
# startup.
readiness_period = env.storage_controller.wait_until_ready()
assert readiness_period < 5
# Consistency check is safe here: the storage controller's restart should not have caused any reconcilers
# to run, as it was in a stable state before restart. If it did, that's a bug.
env.storage_controller.consistency_check()
check_memory()
# Restart pageservers: this exercises the /re-attach API
for pageserver in env.pageservers:
pageserver.stop()
pageserver.start()
# Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
# as they were not offline long enough to trigger any scheduling changes.
env.storage_controller.consistency_check()
check_memory()
# Stop the storage controller before tearing down fixtures, because it otherwise might log
# errors trying to call our `ComputeReconfigure`.
env.storage_controller.stop()

View File

@@ -928,8 +928,6 @@ def test_sharding_split_failures(
".*Reconcile error: receive body: error sending request for url.*",
# Node offline cases will fail inside reconciler when detaching secondaries
".*Reconcile error on shard.*: receive body: error sending request for url.*",
# Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline
".*Reconcile error.*Cancelled.*",
# While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
]

View File

@@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_idle_reconnections")
timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
def collect_stats() -> Dict[str, float]:
# we need to collect safekeeper_pg_queries_received_total metric from all safekeepers
@@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
collect_stats()
endpoint = env.endpoints.create_start("test_idle_reconnections")
endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
# just write something to the timeline
endpoint.safe_psql("create table t(i int)")
collect_stats()
@@ -2007,47 +2007,3 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
)
log.info(f"dump_control_file response: {res}")
assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
# Test disables periodic pushes from safekeeper to the broker and checks that
# pageserver can still discover safekeepers with discovery requests.
def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_broker_discovery")
endpoint = env.endpoints.create_start(
"test_broker_discovery",
config_lines=["shared_buffers=1MB"],
)
endpoint.safe_psql("create table t(i int, payload text)")
# Install extension containing function needed to clear buffer
endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
def do_something():
time.sleep(1)
# generate some data to commit WAL on safekeepers
endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
# clear the buffers
endpoint.safe_psql("select clear_buffer_cache()")
# read data to fetch pages from pageserver
endpoint.safe_psql("select sum(i) from t")
do_something()
do_something()
for sk in env.safekeepers:
# Disable periodic broker push, so pageserver won't be able to discover
# safekeepers without sending a discovery request
sk.stop().start(extra_opts=["--disable-periodic-broker-push"])
do_something()
do_something()
# restart pageserver and check how everything works
env.pageserver.stop().start()
do_something()
do_something()