Compare commits

..

8 Commits

Author SHA1 Message Date
Arpad Müller
bcfab8224c Merge commit 'd9b55732f5a4b6a6cca1df4ae3a13ed656d44697' into arpad/remove_dir_ignore_races 2024-01-19 21:59:30 +01:00
Arpad Müller
7acfbb00f9 fix 2024-01-19 21:59:24 +01:00
Arpad Müller
e73c6495dc Remove the directory as well 2024-01-19 21:55:04 +01:00
Arpad Müller
403a128aa4 Use it where we need directory removal, not where we need file removal 2024-01-19 20:56:56 +01:00
Arpad Müller
d9b55732f5 Duplicate the test to try to reproduce the issue 2024-01-19 16:43:08 +01:00
Arpad Müller
ec8c0206f1 Fix clippy 2024-01-17 19:02:54 +01:00
Arpad Müller
4ee7ba23d0 Make the function sync and not use backoff 2024-01-13 10:17:03 +01:00
Arpad Müller
6eb01fb596 retrying version of remove_dir_all 2024-01-13 10:17:03 +01:00
48 changed files with 789 additions and 1152 deletions

1
Cargo.lock generated
View File

@@ -3991,7 +3991,6 @@ dependencies = [
"url",
"utils",
"uuid",
"walkdir",
"webpki-roots 0.25.2",
"workspace_hack",
"x509-parser",

View File

@@ -883,10 +883,8 @@ FROM debian:bullseye-slim
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
# create folder for file cache
mkdir -p -m 777 /neon/cache

View File

@@ -32,6 +32,8 @@
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' \
//! --pgbouncer-ini-path /etc/pgbouncer.ini
//! ```
//!
use std::collections::HashMap;
@@ -110,6 +112,9 @@ fn main() -> Result<()> {
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -220,6 +225,8 @@ fn main() -> Result<()> {
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
};
let compute = Arc::new(compute_node);
@@ -516,6 +523,23 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-connstr")
.long("pgbouncer-connstr")
.default_value(
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
)
.value_name("PGBOUNCER_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-ini-path")
.long("pgbouncer-ini-path")
// Note: this doesn't match current path for pgbouncer.ini.
// Until we fix it, we need to pass the path explicitly
// or this will be effectively no-op.
.default_value("/etc/pgbouncer.ini")
.value_name("PGBOUNCER_INI_PATH"),
)
}
/// When compute_ctl is killed, send also termination signal to sync-safekeepers

View File

@@ -71,6 +71,10 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
// connection string to pgbouncer to change settings
pub pgbouncer_connstr: Option<String>,
// path to pgbouncer.ini to change settings
pub pgbouncer_ini_path: Option<String>,
}
// store some metrics about download size that might impact startup time
@@ -765,8 +769,8 @@ impl ComputeNode {
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
@@ -775,9 +779,15 @@ impl ComputeNode {
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
let pgbouncer_settings = spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
@@ -842,8 +852,8 @@ impl ComputeNode {
);
// tune pgbouncer
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
info!("tuning pgbouncer");
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
@@ -852,9 +862,15 @@ impl ComputeNode {
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pgbouncer_settings.clone();
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}

View File

@@ -366,7 +366,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
}
/// Update pgbouncer.ini with provided options
fn update_pgbouncer_ini(
pub fn update_pgbouncer_ini(
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
@@ -375,10 +375,6 @@ fn update_pgbouncer_ini(
for (option_name, value) in pgbouncer_config.iter() {
section.insert(option_name, value);
debug!(
"Updating pgbouncer.ini with new values {}={}",
option_name, value
);
}
conf.write_to_file(pgbouncer_ini_path)?;
@@ -388,79 +384,48 @@ fn update_pgbouncer_ini(
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
// when pgbouncer is running under the same user.
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
} else {
// for k8s use normal connection string with password
// to connect to pgbouncer admin console
let mut pgbouncer_connstr =
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
}
pgbouncer_connstr
};
info!(
"Connecting to pgbouncer with connection string: {}",
pgbouncer_connstr
);
// connect to pgbouncer, retrying several times
// because pgbouncer may not be ready yet
let mut retries = 3;
let client = loop {
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
Ok((client, connection)) => {
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
break client;
pub async fn tune_pgbouncer(
pgbouncer_settings: Option<HashMap<String, String>>,
pgbouncer_connstr: &str,
pgbouncer_ini_path: Option<String>,
) -> Result<()> {
if let Some(pgbouncer_config) = pgbouncer_settings {
// Apply new config
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
let (client, connection) = connect_result.unwrap();
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
Err(e) => {
if retries == 0 {
return Err(e.into());
}
error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
retries -= 1;
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
};
});
// Apply new config
for (option_name, value) in pgbouncer_config.iter() {
let query = format!("SET {}={}", option_name, value);
// keep this log line for debugging purposes
info!("Applying pgbouncer setting change: {}", query);
if let Err(err) = client.simple_query(&query).await {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
for (option_name, value) in pgbouncer_config.iter() {
info!(
"Applying pgbouncer setting change: {} = {}",
option_name, value
);
};
}
let query = format!("SET {} = {}", option_name, value);
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
// in VMs we use /etc/pgbouncer.ini
"/etc/pgbouncer.ini".to_string()
} else {
// in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
// this is a shared volume between pgbouncer and postgres containers
// FIXME: fix permissions for this file
"/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
};
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
let result = client.simple_query(&query).await;
info!("Applying pgbouncer setting change: {}", query);
info!("pgbouncer setting change result: {:?}", result);
if let Err(err) = result {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
}
}
Ok(())
}

View File

@@ -38,6 +38,40 @@ pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
Ok(content)
}
/// Idempotent variant of [`std::fs::remove_dir_all`] that tolerates parallel removals of the
/// same path or of sub-paths.
///
/// Every `NotFound` error hit along the way (listing the directory, reading an entry, querying
/// its file type, removing a file, removing the directory itself) is treated as "already gone".
/// Consequently `Ok(())` is returned even if the path has already been removed or never existed,
/// unlike `remove_dir_all` from std/tokio. Any other I/O error is returned to the caller.
pub fn remove_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
    /// Turns a `NotFound` failure into `Ok(None)`; everything else passes through unchanged.
    fn tolerate_missing<T>(res: io::Result<T>) -> io::Result<Option<T>> {
        match res {
            Ok(value) => Ok(Some(value)),
            Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(None),
            Err(err) => Err(err),
        }
    }

    let root = path.as_ref();

    // Nothing to do when the directory is already gone.
    let Some(entries) = tolerate_missing(std::fs::read_dir(root))? else {
        return Ok(());
    };

    for entry in entries {
        // An entry may vanish between being listed and being inspected.
        let Some(entry) = tolerate_missing(entry)? else {
            continue;
        };
        let Some(kind) = tolerate_missing(entry.file_type())? else {
            continue;
        };

        let child = entry.path();
        if kind.is_dir() {
            remove_dir_all(child)?;
        } else {
            tolerate_missing(std::fs::remove_file(child))?;
        }
    }

    // Finally remove the (now empty) directory itself, again ignoring a concurrent removal.
    tolerate_missing(std::fs::remove_dir(root))?;
    Ok(())
}
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
if e.kind() == io::ErrorKind::NotFound {
Ok(())

View File

@@ -108,32 +108,9 @@ pub struct RelTagBlockNo {
}
impl PagestreamClient {
pub async fn shutdown(self) {
let Self {
copy_both,
cancel_on_client_drop: cancel_conn_task,
conn_task,
} = self;
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
//
// If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
// the CopyFail message only makes it to the socket sometimes (i.e., it's a race).
//
// Further, the pageserver makes a lot of noise when it receives CopyFail.
// Computes don't send it in practice, they just hard-close the connection.
//
// So, let's behave like the computes and suppress the CopyFail as follows:
// kill the socket first, then drop copy_both.
//
// See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
//
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
// => https://github.com/neondatabase/neon/issues/6390
let _ = cancel_conn_task.unwrap();
conn_task.await.unwrap();
drop(copy_both);
pub async fn shutdown(mut self) {
let _ = self.cancel_on_client_drop.take();
self.conn_task.await.unwrap();
}
pub async fn getpage(

View File

@@ -404,27 +404,23 @@ async fn client(
.await
.unwrap();
let do_requests = async {
start_work_barrier.wait().await;
while let Some(req) = work.recv().await {
let start = Instant::now();
client
.getpage(req)
.await
.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
};
tokio::select! {
res = do_requests => { res },
_ = cancel.cancelled() => {
client.shutdown().await;
return;
}
start_work_barrier.wait().await;
while let Some(req) =
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
{
let start = Instant::now();
let res = tokio::select! {
res = client.getpage(req) => { res },
_ = cancel.cancelled() => { return; }
};
res.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
}

View File

@@ -35,7 +35,6 @@ fn main() {
logging::Output::Stderr,
)
.unwrap();
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let args = Args::parse();
match args {

View File

@@ -386,56 +386,39 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut warned = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
let selection = select_victims(&candidates, usage_pre);
let mut candidates = candidates;
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
// for comparison here. this is a temporary measure to develop alternatives.
use std::fmt::Write;
let mut summary_buf = String::with_capacity(256);
{
let absolute_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{absolute_summary}").expect("string grows");
info!("absolute accessed selection summary: {summary_buf}");
for (i, (partition, candidate)) in candidates.iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
"took enough candidates for pressure to be relieved"
);
break;
}
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
let selection = select_victims(&candidates, usage_pre);
{
summary_buf.clear();
let relative_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{relative_summary}").expect("string grows");
info!("relative accessed selection summary: {summary_buf}");
if partition == &MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
selection
} else {
selection
usage_planned.add_available_bytes(candidate.layer.get_file_size());
evicted_amount += 1;
}
let usage_planned = match warned {
Some(respecting_tenant_min_resident_size) => PlannedUsage {
respecting_tenant_min_resident_size,
fallback_to_global_lru: Some(usage_planned),
},
None => PlannedUsage {
respecting_tenant_min_resident_size: usage_planned,
fallback_to_global_lru: None,
},
};
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
debug!(?usage_planned, "usage planned");
// phase2: evict layers
@@ -813,16 +796,14 @@ async fn collect_eviction_candidates(
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
tenant_id=%tenant.tenant_id(),
overridden_size=s,
"using overridden min resident size for tenant"
);
s
} else {
debug!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
tenant_id=%tenant.tenant_id(),
max_layer_size,
"using max layer size as min_resident_size for tenant",
);
@@ -927,80 +908,22 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
// will sort later by candidate.relative_last_activity to compare the evictions.
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
match eviction_order {
EvictionOrder::AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
EvictionOrder::RelativeAccessed { .. } => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
}
}
Ok(EvictionCandidates::Finished(candidates))
}
/// Walks the pre-sorted list of all layers in the system from the front and counts how many of
/// them have to be evicted before the planned usage no longer reports pressure.
///
/// The returned `VictimSelection` carries the number of selected candidates together with the
/// usage before and after the planned evictions. If a candidate from the
/// `MinResidentSizePartition::Below` partition had to be selected, it also records the planned
/// usage and the candidate index at the moment that first happened.
fn select_victims<U: Usage>(
    candidates: &[(MinResidentSizePartition, EvictionCandidate)],
    usage_pre: U,
) -> VictimSelection<U> {
    let mut selection = VictimSelection {
        amount: 0,
        usage_pre,
        usage_when_switched: None,
        usage_planned: usage_pre,
    };

    for (partition, candidate) in candidates {
        if !selection.usage_planned.has_pressure() {
            break;
        }

        // `amount` is bumped exactly once per visited candidate, so at this point it equals the
        // index of the current candidate.
        if selection.usage_when_switched.is_none()
            && *partition == MinResidentSizePartition::Below
        {
            selection.usage_when_switched = Some((selection.usage_planned, selection.amount));
        }

        selection
            .usage_planned
            .add_available_bytes(candidate.layer.get_file_size());
        selection.amount += 1;
    }

    selection
}
/// Outcome of `select_victims`: how many leading candidates were picked and the usage figures
/// around that choice.
struct VictimSelection<U> {
/// Number of candidates selected from the front of the candidate list.
amount: usize,
/// Usage as passed to `select_victims`, before any planned eviction.
usage_pre: U,
/// Planned usage and candidate index at the first selected candidate that belongs to the
/// `MinResidentSizePartition::Below` partition; `None` if no such candidate was selected.
usage_when_switched: Option<(U, usize)>,
/// Planned usage after accounting for the file sizes of all selected candidates.
usage_planned: U,
}
impl<U: Usage> VictimSelection<U> {
    /// Consumes the selection, logs its outcome and returns the number of selected candidates
    /// together with the corresponding `PlannedUsage`.
    ///
    /// When `usage_when_switched` is set, a warning is emitted and the usage recorded there
    /// becomes `respecting_tenant_min_resident_size`, with the final planned usage reported as
    /// `fallback_to_global_lru`. Otherwise the final planned usage is reported as
    /// `respecting_tenant_min_resident_size` and there is no fallback value.
    fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
        let Self {
            amount,
            usage_pre,
            usage_when_switched,
            usage_planned: usage_final,
        } = self;

        debug!(
            evicted_amount=%amount,
            "took enough candidates for pressure to be relieved"
        );

        if let Some((usage_planned, candidate_no)) = &usage_when_switched {
            warn!(?usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
        }

        let planned = if let Some((respecting_tenant_min_resident_size, _)) = usage_when_switched {
            PlannedUsage {
                respecting_tenant_min_resident_size,
                fallback_to_global_lru: Some(usage_final),
            }
        } else {
            PlannedUsage {
                respecting_tenant_min_resident_size: usage_final,
                fallback_to_global_lru: None,
            }
        };

        (amount, planned)
    }
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
@@ -1085,137 +1008,6 @@ pub(crate) mod finite_f32 {
}
}
mod summary {
use super::finite_f32::FiniteF32;
use super::{EvictionCandidate, LayerCount};
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::time::SystemTime;
/// Aggregated description of a set of eviction candidates, built through its `FromIterator`
/// impl and rendered as text through its `Display` impl.
#[derive(Debug, Default)]
pub(super) struct EvictionSummary {
/// Layer count and summed file sizes of the summarized candidates, per tenant shard.
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
/// Layer count and summed file sizes over all summarized candidates.
total: LayerCount,
/// `last_activity_ts` of the candidate that was summarized last; `None` for an empty summary.
last_absolute: Option<SystemTime>,
/// `relative_last_activity` of the candidate that was summarized last; `None` for an empty
/// summary.
last_relative: Option<FiniteF32>,
}
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
    /// Builds a summary by visiting the candidates in iteration order.
    ///
    /// Each candidate adds one layer and its file size both to its tenant shard's counters and
    /// to the overall total. `last_absolute` / `last_relative` are overwritten on every step, so
    /// they end up describing the candidate that was yielded last.
    fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
        iter.into_iter()
            .fold(EvictionSummary::default(), |mut acc, candidate| {
                let per_tenant = acc
                    .evicted_per_tenant
                    .entry(*candidate.layer.get_tenant_shard_id())
                    .or_default();
                let layer_bytes = candidate.layer.get_file_size();

                per_tenant.file_sizes += layer_bytes;
                per_tenant.count += 1;

                acc.total.file_sizes += layer_bytes;
                acc.total.count += 1;

                acc.last_absolute = Some(candidate.last_activity_ts);
                acc.last_relative = Some(candidate.relative_last_activity);
                acc
            })
    }
}
/// Byte count that formats itself in a human readable way.
///
/// Despite the name, the units are binary: values below 1024 are printed as plain bytes
/// (`"512B"`), everything else is scaled by powers of 1024 and printed with one decimal and a
/// `KiB`/`MiB`/`GiB`/`TiB`/`PiB`/`EiB` suffix.
struct SiBytesAmount(u64);

impl std::fmt::Display for SiBytesAmount {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        const SUFFIXES: &[u8] = b"KMGTPE";

        if self.0 < 1024 {
            return write!(f, "{}B", self.0);
        }

        // `scaled` is kept one power of 1024 above the unit that gets printed, so the final
        // floating point division by 1024 yields the fractional digit.
        let mut scaled = self.0;
        let mut unit = 0;

        // `>=` (not `>`): an exact power of 1024 must move on to the next unit, otherwise
        // 1 MiB would be rendered as "1024.0KiB".
        while scaled >= 1024 * 1024 && unit < SUFFIXES.len() - 1 {
            scaled /= 1024;
            unit += 1;
        }

        let suffix = SUFFIXES[unit] as char;

        write!(f, "{:.1}{suffix}iB", scaled as f64 / 1024.0)
    }
}
impl std::fmt::Display for EvictionSummary {
    /// Renders the summary as a multi-line report.
    ///
    /// The first line gives the total number of layers, their total size and the
    /// `(last_absolute, last_relative)` pair. After that tenants are grouped by how many of
    /// their layers were selected; the groups with the highest layer counts are printed one per
    /// line (tenants listed individually when a group has fewer than three of them, aggregated
    /// otherwise). If more groups exist than are printed, a final line sums up the tenants,
    /// layers and bytes of the groups that were left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Number of layer-count groups that get their own output line.
        const SHOWN_GROUPS: usize = 10;

        // wasteful, but it's for testing
        let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();

        for (tenant_shard_id, count) in &self.evicted_per_tenant {
            sorted
                .entry(count.count)
                .or_default()
                .push((*tenant_shard_id, count.file_sizes));
        }

        let total_file_sizes = SiBytesAmount(self.total.file_sizes);

        writeln!(
            f,
            "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
            self.total.count, self.last_absolute, self.last_relative,
        )?;

        // Highest layer counts first.
        for (count, per_tenant) in sorted.iter().rev().take(SHOWN_GROUPS) {
            write!(f, "- {count} layers: ")?;

            if per_tenant.len() < 3 {
                for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    let bytes = SiBytesAmount(*bytes);
                    write!(f, "{tenant_shard_id} ({bytes})")?;
                }
            } else {
                let num_tenants = per_tenant.len();
                let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
                let total_bytes = SiBytesAmount(total_bytes);
                let layers = num_tenants * count;

                write!(
                    f,
                    "{num_tenants} tenants {total_bytes} in total {layers} layers",
                )?;
            }

            writeln!(f)?;
        }

        if sorted.len() > SHOWN_GROUPS {
            // Only the groups that were NOT printed above belong into the remainder, hence the
            // `skip`. Every tenant of a group contributes `count` layers, so the layer total is
            // `count * tenants`, and the tenant total is the sum of the group sizes.
            let (rem_tenants, rem_count, rem_bytes) = sorted
                .iter()
                .rev()
                .skip(SHOWN_GROUPS)
                .fold(
                    (0usize, 0usize, 0u64),
                    |(tenants, layers, size), (count, per_tenant)| {
                        let group_size =
                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
                        (
                            tenants + per_tenant.len(),
                            layers + count * per_tenant.len(),
                            size + group_size,
                        )
                    },
                );

            let rem_bytes = SiBytesAmount(rem_bytes);

            writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", rem_tenants, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
        }

        Ok(())
    }
}
}
mod filesystem_level_usage {
use anyhow::Context;
use camino::Utf8Path;

View File

@@ -1236,7 +1236,7 @@ async fn tenant_create_handler(
json_response(
StatusCode::CREATED,
TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
TenantCreateResponse(new_tenant.tenant_id()),
)
}

View File

@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
/// Prometheus histogram buckets (in seconds) for operations in the critical
/// path. In other words, operations that directly affect that latency of user
@@ -59,7 +59,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
register_counter_vec!(
"pageserver_storage_operations_seconds_sum",
"Total time spent on storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "shard_id", "timeline_id"],
&["operation", "tenant_id", "timeline_id"],
)
.expect("failed to define a metric")
});
@@ -68,7 +68,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
register_int_counter_vec!(
"pageserver_storage_operations_seconds_count",
"Count of storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "shard_id", "timeline_id"],
&["operation", "tenant_id", "timeline_id"],
)
.expect("failed to define a metric")
});
@@ -373,7 +373,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_last_record_lsn",
"Last record LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -382,7 +382,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -400,7 +400,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -433,7 +433,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_current_logical_size",
"Current logical size grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define current logical size metric")
});
@@ -582,7 +582,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_broken_tenants_count",
"Set of broken tenants",
&["tenant_id", "shard_id"]
&["tenant_id"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
@@ -602,7 +602,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -611,7 +611,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -630,7 +630,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_evictions",
"Number of layers evicted from the pageserver",
&["tenant_id", "shard_id", "timeline_id"]
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -927,7 +927,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "shard_id", "timeline_id"]
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -1002,7 +1002,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
&["smgr_query_type", "tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
@@ -1069,9 +1069,8 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
@@ -1079,7 +1078,7 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[op.into()])
.unwrap();
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
.unwrap();
GlobalAndPerTimelineHistogram {
global,
@@ -1099,7 +1098,6 @@ impl SmgrQueryTimePerTimeline {
#[cfg(test)]
mod smgr_query_time_tests {
use pageserver_api::shard::TenantShardId;
use strum::IntoEnumIterator;
use utils::id::{TenantId, TimelineId};
@@ -1126,10 +1124,7 @@ mod smgr_query_time_tests {
for op in &ops {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let metrics = super::SmgrQueryTimePerTimeline::new(
&TenantShardId::unsharded(tenant_id),
&timeline_id,
);
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
let get_counts = || {
let global: u64 = ops
@@ -1210,13 +1205,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
"Number of ongoing calls to remote timeline client. \
Used to populate pageserver_remote_timeline_client_calls_started. \
This metric is not useful for sampling from Prometheus, but useful in tests.",
&[
"tenant_id",
"shard_id",
"timeline_id",
"file_kind",
"op_kind"
],
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
@@ -1237,23 +1226,22 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_started",
"Incremented by the number of bytes associated with a remote timeline client operation. \
The increment happens when the operation is scheduled.",
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_finished",
"Incremented by the number of bytes associated with a remote timeline client operation. \
The increment happens when the operation finishes (regardless of success/failure/shutdown).",
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
@@ -1699,19 +1687,14 @@ pub(crate) struct StorageTimeMetrics {
}
impl StorageTimeMetrics {
pub fn new(
operation: StorageTimeOperation,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> Self {
pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
let operation: &'static str = operation.into();
let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.unwrap();
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.unwrap();
let global_histogram = STORAGE_TIME_GLOBAL
.get_metric_with_label_values(&[operation])
@@ -1763,66 +1746,40 @@ impl TimelineMetrics {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let flush_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::LayerFlush,
&tenant_id,
&shard_id,
&timeline_id,
);
let compact_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::Compact,
&tenant_id,
&shard_id,
&timeline_id,
);
let create_images_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::CreateImages,
&tenant_id,
&shard_id,
&timeline_id,
);
let logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::LogicalSize,
&tenant_id,
&shard_id,
&timeline_id,
);
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
let create_images_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
let logical_size_histo =
StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
let imitate_logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::ImitateLogicalSize,
&tenant_id,
&shard_id,
&timeline_id,
);
let load_layer_map_histo = StorageTimeMetrics::new(
StorageTimeOperation::LoadLayerMap,
&tenant_id,
&shard_id,
&timeline_id,
);
let garbage_collect_histo = StorageTimeMetrics::new(
StorageTimeOperation::Gc,
&tenant_id,
&shard_id,
&timeline_id,
);
let load_layer_map_histo =
StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
let garbage_collect_histo =
StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
// TODO: we shouldn't expose this metric
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
@@ -1876,17 +1833,15 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ =
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -1899,42 +1854,29 @@ impl Drop for TimelineMetrics {
// outlive an individual smgr connection, but not the timeline.
for op in StorageTimeOperation::VARIANTS {
let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
op,
tenant_id,
shard_id,
timeline_id,
]);
let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
op,
tenant_id,
shard_id,
timeline_id,
]);
let _ =
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ =
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in STORAGE_IO_SIZE_OPERATIONS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in SmgrQueryType::iter() {
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
op.into(),
tenant_id,
shard_id,
timeline_id,
]);
}
}
}
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
// Only shard zero deals in synthetic sizes
if tenant_shard_id.is_zero() {
let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let tid = tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
// we leave the BROKEN_TENANTS_SET entry if any
}
@@ -1984,7 +1926,6 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
pub(crate) struct RemoteTimelineClientMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
@@ -1996,7 +1937,6 @@ impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_id: format!("{}", tenant_shard_id.shard_slug()),
timeline_id: timeline_id.to_string(),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
@@ -2011,9 +1951,8 @@ impl RemoteTimelineClientMetrics {
PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
.unwrap(),
)
@@ -2048,9 +1987,8 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
.get_metric_with_label_values(&[
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
@@ -2080,9 +2018,8 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
.get_metric_with_label_values(&[
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
@@ -2101,9 +2038,8 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
.get_metric_with_label_values(&[
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
@@ -2247,7 +2183,6 @@ impl Drop for RemoteTimelineClientMetrics {
fn drop(&mut self) {
let RemoteTimelineClientMetrics {
tenant_id,
shard_id,
timeline_id,
remote_physical_size_gauge,
calls_unfinished_gauge,
@@ -2257,7 +2192,6 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2266,7 +2200,6 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2275,7 +2208,6 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2283,7 +2215,7 @@ impl Drop for RemoteTimelineClientMetrics {
}
{
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in destructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
}
}
@@ -2293,6 +2225,8 @@ impl Drop for RemoteTimelineClientMetrics {
pub(crate) trait MeasureRemoteOp: Sized {
fn measure_remote_op(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
metrics: Arc<RemoteTimelineClientMetrics>,
@@ -2300,6 +2234,8 @@ pub(crate) trait MeasureRemoteOp: Sized {
let start = Instant::now();
MeasuredRemoteOp {
inner: self,
tenant_id,
timeline_id,
file_kind,
op,
start,
@@ -2315,6 +2251,8 @@ pin_project! {
{
#[pin]
inner: F,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
start: Instant,

View File

@@ -384,17 +384,11 @@ impl PageServerHandler {
}
}
/// Future that completes when we need to shut down the connection.
/// Analogous to calling cancelled() on a Timeline's cancellation token: waits for cancellation.
///
/// Reasons for need to shut down are:
/// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
/// - task_mgr requests shutdown of the connection
///
/// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
/// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
///
/// NB: keep in sync with [`Self::is_connection_cancelled`]
async fn await_connection_cancelled(&self) {
/// We use many Timeline objects, and hold GateGuards on all of them. We must therefore respect
/// all of their cancellation tokens.
async fn timeline_cancelled(&self) {
// A short wait before we expend the cycles to walk our timeline map. This avoids incurring
// that cost every time we check for cancellation.
tokio::time::sleep(Duration::from_millis(10)).await;
@@ -410,19 +404,14 @@ impl PageServerHandler {
.map(|ht| ht.timeline.cancel.cancelled())
.collect::<FuturesUnordered<_>>();
tokio::select! {
_ = task_mgr::shutdown_watcher() => { }
_ = futs.next() => {}
}
futs.next().await;
}
/// Checking variant of [`Self::await_connection_cancelled`].
fn is_connection_cancelled(&self) -> bool {
task_mgr::is_shutdown_requested()
|| self
.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
/// Analogous to calling is_cancelled() on a Timeline's cancellation token
fn timeline_is_cancelled(&self) -> bool {
self.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
}
/// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in
@@ -443,7 +432,7 @@ impl PageServerHandler {
flush_r = pgb.flush() => {
Ok(flush_r?)
},
_ = self.await_connection_cancelled() => {
_ = self.timeline_cancelled() => {
Err(QueryError::Shutdown)
}
_ = cancel.cancelled() => {
@@ -556,11 +545,13 @@ impl PageServerHandler {
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
loop {
let msg = tokio::select! {
biased;
_ = self.await_connection_cancelled() => {
_ = self.timeline_cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
@@ -594,6 +585,7 @@ impl PageServerHandler {
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -603,6 +595,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::Nblocks(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -612,6 +605,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetPage(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
@@ -621,6 +615,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::DbSize(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -643,7 +638,7 @@ impl PageServerHandler {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
Err(e) if self.is_connection_cancelled() => {
Err(e) if self.timeline_is_cancelled() => {
// This branch accommodates code within request handlers that returns an anyhow::Error instead of a clean
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
//
@@ -870,9 +865,6 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetRelExists);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
@@ -896,11 +888,6 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetRelSize);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -923,11 +910,6 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetDbSize);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -1098,10 +1080,6 @@ impl PageServerHandler {
}
};
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)

View File

@@ -112,7 +112,7 @@ use toml_edit;
use utils::{
crashsafe,
generation::Generation,
id::TimelineId,
id::{TenantId, TimelineId},
lsn::{Lsn, RecordLsn},
};
@@ -371,13 +371,13 @@ impl WalRedoManager {
pub enum GetTimelineError {
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
NotFound {
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
},
}
@@ -1517,6 +1517,10 @@ impl Tenant {
.map_err(LoadLocalTimelineError::Load)
}
pub(crate) fn tenant_id(&self) -> TenantId {
self.tenant_shard_id.tenant_id
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
@@ -1532,13 +1536,13 @@ impl Tenant {
let timeline = timelines_accessor
.get(&timeline_id)
.ok_or(GetTimelineError::NotFound {
tenant_id: self.tenant_shard_id,
tenant_id: self.tenant_shard_id.tenant_id,
timeline_id,
})?;
if active_only && !timeline.is_active() {
Err(GetTimelineError::NotActive {
tenant_id: self.tenant_shard_id,
tenant_id: self.tenant_shard_id.tenant_id,
timeline_id,
state: timeline.current_state(),
})
@@ -2593,9 +2597,7 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
// Strings for metric labels
let tid = tenant_shard_id.to_string();
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2608,15 +2610,13 @@ impl Tenant {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(
crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
);
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid, &shard_id_str])
.with_label_values(&[&tid])
.set(1);
true
};
@@ -2642,7 +2642,7 @@ impl Tenant {
counted_broken = true;
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid, &shard_id_str])
.with_label_values(&[&tid])
.inc();
}
}
@@ -3290,7 +3290,7 @@ impl Tenant {
}
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
scopeguard::defer! {
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
if let Err(e) = utils::fs_ext::remove_dir_all(&pgdata_path) {
// this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
}
@@ -3629,9 +3629,6 @@ impl Tenant {
self.cached_synthetic_tenant_size
.store(size, Ordering::Relaxed);
// Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_zero());
TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
.unwrap()
@@ -3783,7 +3780,7 @@ async fn run_initdb(
impl Drop for Tenant {
fn drop(&mut self) {
remove_tenant_metrics(&self.tenant_shard_id);
remove_tenant_metrics(&self.tenant_shard_id.tenant_id);
}
}
/// Dump contents of a layer file to stdout.
@@ -5211,7 +5208,7 @@ mod tests {
assert_eq!(
e,
GetTimelineError::NotFound {
tenant_id: tenant.tenant_shard_id,
tenant_id: tenant.tenant_shard_id.tenant_id,
timeline_id: TIMELINE_ID,
}
)

View File

@@ -847,13 +847,15 @@ impl TenantManager {
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id))
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
@@ -1304,13 +1306,10 @@ impl TenantManager {
#[derive(Debug, thiserror::Error)]
pub(crate) enum GetTenantError {
/// NotFound is a TenantId rather than TenantShardId, because this error type is used from
/// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard.
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantShardId),
NotActive(TenantId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
@@ -1343,13 +1342,15 @@ pub(crate) fn get_tenant(
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id))
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
@@ -1425,7 +1426,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
}
Some(TenantSlot::Secondary(_)) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_shard_id,
tenant_id,
)))
}
Some(TenantSlot::InProgress(barrier)) => {
@@ -1464,7 +1465,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
Some(TenantSlot::Attached(tenant)) => tenant.clone(),
_ => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_shard_id,
tenant_id,
)))
}
}
@@ -1492,7 +1493,7 @@ pub(crate) enum DeleteTimelineError {
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantStateError {
#[error("Tenant {0} is stopping")]
IsStopping(TenantShardId),
IsStopping(TenantId),
#[error(transparent)]
SlotError(#[from] TenantSlotError),
#[error(transparent)]
@@ -2122,7 +2123,7 @@ where
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
// wait for it but return an error right away because these are distinct requests.
slot_guard.revert();
return Err(TenantStateError::IsStopping(tenant_shard_id));
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
}
}
Some(tenant)
@@ -2251,6 +2252,7 @@ pub(crate) async fn immediate_gc(
#[cfg(test)]
mod tests {
use pageserver_api::shard::TenantShardId;
use std::collections::BTreeMap;
use std::sync::Arc;
use tracing::{info_span, Instrument};
@@ -2271,7 +2273,7 @@ mod tests {
// harness loads it to active, which is forced and nothing is running on the tenant
let id = t.tenant_shard_id();
let id = TenantShardId::unsharded(t.tenant_id());
// tenant harness configures the logging and we cannot escape it
let _e = info_span!("testing", tenant_id = %id).entered();

View File

@@ -522,6 +522,8 @@ impl RemoteTimelineClient {
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
@@ -564,6 +566,8 @@ impl RemoteTimelineClient {
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
@@ -1347,6 +1351,8 @@ impl RemoteTimelineClient {
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
@@ -1372,6 +1378,8 @@ impl RemoteTimelineClient {
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),

View File

@@ -252,10 +252,6 @@ pub struct Timeline {
pub(super) metrics: TimelineMetrics,
// `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code
// in `crate::page_service` writes these metrics.
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
/// Ensures layers aren't frozen by checkpointer between
/// [`Timeline::get_layer_for_write`] and layer reads.
/// Locked automatically by [`TimelineWriter`] and checkpointer.
@@ -1319,11 +1315,6 @@ impl Timeline {
),
),
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
&timeline_id,
),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
layer_flush_start_tx,

View File

@@ -14,7 +14,6 @@ use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
@@ -61,7 +60,6 @@ pub struct VirtualFile {
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
// strings.
tenant_id: String,
shard_id: String,
timeline_id: String,
}
@@ -303,24 +301,15 @@ impl VirtualFile {
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
let (tenant_id, shard_id, timeline_id) =
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
let tenant_shard_part = parts[parts.len() - 4];
let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
Ok(tenant_shard_id) => (
tenant_shard_id.tenant_id.to_string(),
format!("{}", tenant_shard_id.shard_slug()),
),
Err(_) => {
// Malformed path: this ID is just for observability, so tolerate it
// and pass through
(tenant_shard_part.to_string(), "*".to_string())
}
};
(tenant_id, shard_id, parts[parts.len() - 2].to_string())
} else {
("*".to_string(), "*".to_string(), "*".to_string())
};
let tenant_id;
let timeline_id;
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
tenant_id = parts[parts.len() - 4].to_string();
timeline_id = parts[parts.len() - 2].to_string();
} else {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
@@ -344,7 +333,6 @@ impl VirtualFile {
path: path.to_path_buf(),
open_options: reopen_options,
tenant_id,
shard_id,
timeline_id,
};
@@ -586,7 +574,7 @@ impl VirtualFile {
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
.add(size as i64);
}
result
@@ -598,7 +586,7 @@ impl VirtualFile {
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
.add(size as i64);
}
result

View File

@@ -2201,8 +2201,7 @@ mod tests {
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path =
remote_initdb_archive_path(&tenant.tenant_shard_id().tenant_id, &TIMELINE_ID);
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
std::fs::create_dir_all(initdb_path.parent().unwrap())

View File

@@ -308,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
neon_log(LOG, "Failed to punch hole in file: %m");
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
lfc_ctl->limit = new_size;
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
elog(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}
@@ -327,7 +327,7 @@ lfc_init(void)
* shared_preload_libraries.
*/
if (!process_shared_preload_libraries_in_progress)
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomIntVariable("neon.max_file_cache_size",
@@ -643,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
neon_log(DEBUG2, "Swap file cache page");
elog(DEBUG2, "Swap file cache page");
}
else
{
@@ -846,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
neon_log(ERROR, "return type must be a row type");
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
neon_log(ERROR, "incorrect number of output arguments");
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);

View File

@@ -990,7 +990,7 @@ nm_pack_request(NeonRequest *msg)
case T_NeonErrorResponse:
case T_NeonDbSizeResponse:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
break;
}
return s;
@@ -1085,7 +1085,7 @@ nm_unpack_response(StringInfo s)
case T_NeonGetPageRequest:
case T_NeonDbSizeRequest:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
elog(ERROR, "unexpected neon message tag 0x%02x", tag);
break;
}
@@ -1277,7 +1277,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
XLogFlush(recptr);
lsn = recptr;
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1305,7 +1305,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
if (PageIsNew((Page) buffer))
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1313,7 +1313,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else if (PageIsEmptyHeapPage((Page) buffer))
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1321,7 +1321,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(PANIC,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1330,7 +1330,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1430,7 +1430,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
lsn = nm_adjust_lsn(lsn);
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
}
else
@@ -1445,7 +1445,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
*latest = true;
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
Assert(lsn != InvalidXLogRecPtr);
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
lsn = nm_adjust_lsn(lsn);
@@ -1465,7 +1465,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
#endif
if (lsn > flushlsn)
{
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
(uint32) (lsn >> 32), (uint32) lsn,
(uint32) (flushlsn >> 32), (uint32) flushlsn);
XLogFlush(lsn);
@@ -1509,7 +1509,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return mdexists(reln, forkNum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
@@ -1561,7 +1561,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -1570,7 +1570,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
return exists;
@@ -1587,7 +1587,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1598,10 +1598,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
@@ -1696,7 +1696,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1707,7 +1707,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/*
@@ -1745,7 +1745,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -1785,7 +1785,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1796,7 +1796,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (max_cluster_size > 0 &&
@@ -1808,7 +1808,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,
(errcode(ERRCODE_DISK_FULL),
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
max_cluster_size),
errhint("This limit is defined by neon.max_cluster_size GUC")));
}
@@ -1821,7 +1821,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
@@ -1882,7 +1882,7 @@ neon_open(SMgrRelation reln)
mdopen(reln);
/* no work */
neon_log(SmgrTrace, "open noop");
elog(SmgrTrace, "[NEON_SMGR] open noop");
}
/*
@@ -1919,7 +1919,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
return mdprefetch(reln, forknum, blocknum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
@@ -1964,11 +1964,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* not implemented */
neon_log(SmgrTrace, "writeback noop");
elog(SmgrTrace, "[NEON_SMGR] writeback noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2098,7 +2098,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
blkno,
RelFileInfoFmt(rinfo),
forkNum,
@@ -2107,7 +2107,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
((NeonErrorResponse *) resp)->message)));
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
/* buffer was used, clean up for later reuse */
@@ -2131,7 +2131,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -2142,7 +2142,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* Try to read from local file cache */
@@ -2170,7 +2170,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
{
if (!PageIsNew((Page) pageserver_masked))
{
neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2180,7 +2180,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
}
else if (PageIsNew((Page) buffer))
{
neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2195,7 +2195,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2214,7 +2214,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2294,13 +2294,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_wallog_page(reln, forknum, blocknum, buffer, false);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -2327,7 +2327,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2338,12 +2338,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return mdnblocks(reln, forknum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
@@ -2371,7 +2371,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2380,11 +2380,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
@@ -2427,7 +2427,7 @@ neon_dbsize(Oid dbNode)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
@@ -2435,10 +2435,10 @@ neon_dbsize(Oid dbNode)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
db_size);
@@ -2458,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2470,7 +2470,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
@@ -2526,7 +2526,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2538,10 +2538,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2566,17 +2566,17 @@ neon_start_unlogged_build(SMgrRelation reln)
* progress at a time. That's enough for the current usage.
*/
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
neon_log(ERROR, "unlogged relation build is already in progress");
elog(ERROR, "unlogged relation build is already in progress");
Assert(unlogged_build_rel == NULL);
ereport(SmgrTrace,
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
(errmsg("starting unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2589,11 +2589,11 @@ neon_start_unlogged_build(SMgrRelation reln)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
elog(ERROR, "cannot perform unlogged index build, index is not empty ");
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
@@ -2620,7 +2620,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2649,7 +2649,7 @@ neon_end_unlogged_build(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
(errmsg("ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2664,7 +2664,7 @@ neon_end_unlogged_build(SMgrRelation reln)
rinfob = InfoBFromSMgrRel(reln);
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
@@ -2707,7 +2707,7 @@ AtEOXact_neon(XactEvent event, void *arg)
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
(errmsg("unlogged index build was not properly finished"))));
}
break;
}
@@ -2806,14 +2806,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
set_cached_relsize(rinfo, forknum, relsize);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
neon_log(SmgrTrace, "Set length to %d", relsize);
elog(SmgrTrace, "Set length to %d", relsize);
}
}
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
/*
* TODO: May be it is better to make correspondent function from freespace.c public?
* TODO: Maybe it is better to make the corresponding function from freespace.c public?
*/
static BlockNumber
get_fsm_physical_block(BlockNumber heapblk)
@@ -2894,7 +2894,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
#if PG_VERSION_NUM < 150000
if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
elog(PANIC, "failed to locate backup block with ID %d", block_id);
#else
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
#endif

View File

@@ -959,8 +959,8 @@ DetermineEpochStartLsn(WalProposer *wp)
}
/*
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
* and nothing was committed yet. Start streaming then from the basebackup LSN.
* If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
* was committed yet. Start streaming then from the basebackup LSN.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
@@ -973,13 +973,12 @@ DetermineEpochStartLsn(WalProposer *wp)
}
/*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so it
* should never be zero at this point, if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
* If propEpochStartLsn is not 0, at least one msg with WAL was sent to
* some connected safekeeper; it must have carried truncateLsn pointing to
* the first record.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
(wp->config->syncSafekeepers && wp->truncateLsn == wp->propEpochStartLsn));
/*
* We will be generating WAL since propEpochStartLsn, so we should set

View File

@@ -89,4 +89,3 @@ camino-tempfile.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
walkdir.workspace = true

View File

@@ -32,7 +32,6 @@ pub struct RequestMonitoring {
user: Option<SmolStr>,
application: Option<SmolStr>,
error_kind: Option<ErrorKind>,
success: bool,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -60,7 +59,6 @@ impl RequestMonitoring {
user: None,
application: None,
error_kind: None,
success: false,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
@@ -98,10 +96,6 @@ impl RequestMonitoring {
self.user = Some(user);
}
/// Marks this request as successful by setting the `success` flag to `true`.
pub fn set_success(&mut self) {
self.success = true;
}
pub fn log(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());

View File

@@ -1,8 +1,7 @@
use std::{sync::Arc, time::SystemTime};
use std::sync::Arc;
use anyhow::Context;
use bytes::BytesMut;
use chrono::{Datelike, Timelike};
use futures::{Stream, StreamExt};
use parquet::{
basic::Compression,
@@ -87,12 +86,6 @@ struct RequestData {
project: Option<String>,
branch: Option<String>,
error: Option<&'static str>,
/// Success is counted if we form a HTTP response with sql rows inside
/// Or if we make it to proxy_pass
success: bool,
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure
duration_us: u64,
}
impl From<RequestMonitoring> for RequestData {
@@ -109,11 +102,6 @@ impl From<RequestMonitoring> for RequestData {
protocol: value.protocol,
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_str()),
success: value.success,
duration_us: SystemTime::from(value.first_packet)
.elapsed()
.unwrap_or_default()
.as_micros() as u64, // 584 millenia... good enough
}
}
}
@@ -278,13 +266,7 @@ async fn upload_parquet(
let compression = len as f64 / len_uncompressed as f64;
let size = data.len();
let now = chrono::Utc::now();
let id = uuid::Uuid::new_v7(uuid::Timestamp::from_unix(
uuid::NoContext,
// we won't be running this in 1970. this cast is ok
now.timestamp() as u64,
now.timestamp_subsec_nanos(),
));
let id = uuid::Uuid::now_v7();
info!(
%id,
@@ -292,14 +274,7 @@ async fn upload_parquet(
size, compression, "uploading request parquet file"
);
let year = now.year();
let month = now.month();
let day = now.day();
let hour = now.hour();
// segment files by time for S3 performance
let path = RemotePath::from_string(&format!(
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
))?;
let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?;
backoff::retry(
|| async {
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
@@ -357,7 +332,6 @@ mod tests {
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use tokio::{sync::mpsc, time};
use walkdir::WalkDir;
use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData};
@@ -446,8 +420,6 @@ mod tests {
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
error: None,
success: rng.gen(),
duration_us: rng.gen_range(0..30_000_000),
}
}
@@ -470,11 +442,9 @@ mod tests {
worker_inner(storage, rx, config).await.unwrap();
let mut files = WalkDir::new(tmpdir.as_std_path())
.into_iter()
.filter_map(|entry| entry.ok())
.filter(|entry| entry.file_type().is_file())
.map(|entry| entry.path().to_path_buf())
let mut files = std::fs::read_dir(tmpdir.as_std_path())
.unwrap()
.map(|entry| entry.unwrap().path())
.collect_vec();
files.sort();
@@ -515,15 +485,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1087635, 3, 6000),
(1087288, 3, 6000),
(1087444, 3, 6000),
(1087572, 3, 6000),
(1087468, 3, 6000),
(1087500, 3, 6000),
(1087533, 3, 6000),
(1087566, 3, 6000),
(362671, 1, 2000)
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
],
);
@@ -553,11 +523,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1028637, 5, 10000),
(1031969, 5, 10000),
(1019900, 5, 10000),
(1020365, 5, 10000),
(1025010, 5, 10000)
(1166201, 6, 12000),
(1163577, 6, 12000),
(1164641, 6, 12000),
(1168772, 6, 12000),
(196761, 1, 2000)
],
);
@@ -589,11 +559,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1210770, 6, 12000),
(1211036, 6, 12000),
(1210990, 6, 12000),
(1210861, 6, 12000),
(202073, 1, 2000)
(1144934, 6, 12000),
(1144941, 6, 12000),
(1144735, 6, 12000),
(1144936, 6, 12000),
(191035, 1, 2000)
],
);
@@ -618,15 +588,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1087635, 3, 6000),
(1087288, 3, 6000),
(1087444, 3, 6000),
(1087572, 3, 6000),
(1087468, 3, 6000),
(1087500, 3, 6000),
(1087533, 3, 6000),
(1087566, 3, 6000),
(362671, 1, 2000)
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
],
);
@@ -663,7 +633,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)],
[(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)],
);
tmpdir.close().unwrap();

View File

@@ -356,7 +356,6 @@ pub async fn proxy_pass(
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {

View File

@@ -46,11 +46,14 @@ enum Notification {
}
/// Deserialized payload of an allowed-IPs update message.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct AllowedIpsUpdate {
/// Project identifier; read from the `project` key of the payload.
#[serde(rename = "project")]
project_id: SmolStr,
}
/// Deserialized payload of a password update message.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct PasswordUpdate {
/// Project identifier; read from the `project` key of the payload.
#[serde(rename = "project")]
project_id: SmolStr,
/// Role name; read from the `role` key of the payload.
#[serde(rename = "role")]
role_name: SmolStr,
}
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
@@ -148,7 +151,7 @@ mod tests {
#[test]
fn parse_allowed_ips() -> anyhow::Result<()> {
let project_id = "new_project".to_string();
let data = format!("{{\"project_id\": \"{project_id}\"}}");
let data = format!("{{\"project\": \"{project_id}\"}}");
let text = json!({
"type": "message",
"topic": "/allowed_ips_updated",
@@ -174,7 +177,7 @@ mod tests {
fn parse_password_updated() -> anyhow::Result<()> {
let project_id = "new_project".to_string();
let role_name = "new_role".to_string();
let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
let data = format!("{{\"project\": \"{project_id}\", \"role\": \"{role_name}\"}}");
let text = json!({
"type": "message",
"topic": "/password_updated",

View File

@@ -26,7 +26,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console::{self, messages::MetricsAuxInfo},
console,
context::RequestMonitoring,
metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::connect_compute::ConnectMechanism,
@@ -362,7 +362,6 @@ impl GlobalConnPool {
// ok return cached connection if found and establish a new one otherwise
let new_client = if let Some(client) = client {
ctx.set_project(client.aux.clone());
if client.inner.is_closed() {
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
@@ -594,6 +593,10 @@ async fn connect_to_compute_once(
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
});
let ids = Ids {
endpoint_id: node_info.aux.endpoint_id.clone(),
branch_id: node_info.aux.branch_id.clone(),
};
let db_user = conn_info.db_and_user();
tokio::spawn(
@@ -661,7 +664,7 @@ async fn connect_to_compute_once(
Ok(ClientInner {
inner: client,
session: tx,
aux: node_info.aux.clone(),
ids,
conn_id,
})
}
@@ -669,17 +672,13 @@ async fn connect_to_compute_once(
struct ClientInner {
inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender<uuid::Uuid>,
aux: MetricsAuxInfo,
ids: Ids,
conn_id: uuid::Uuid,
}
impl Client {
pub fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
})
USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone())
}
}

View File

@@ -497,7 +497,6 @@ async fn handle_inner(
}
};
ctx.set_success();
ctx.log();
let metrics = client.metrics();

View File

@@ -8,8 +8,6 @@ use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
use remote_storage::RemoteStorageConfig;
use safekeeper::control_file::FileStorage;
use safekeeper::state::TimelinePersistentState;
use sd_notify::NotifyState;
use tokio::runtime::Handle;
use tokio::signal::unix::{signal, SignalKind};
@@ -32,12 +30,12 @@ use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{json_merge, wal_service};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
@@ -107,6 +105,9 @@ struct Args {
/// Do not wait for changes to be written safely to disk. Unsafe.
#[arg(short, long)]
no_sync: bool,
/// Dump control file at path specified by this argument and exit.
#[arg(long)]
dump_control_file: Option<Utf8PathBuf>,
/// Broker endpoint for storage nodes coordination in the form
/// http[s]://host:port. In case of https schema TLS connection is
/// established; plaintext otherwise.
@@ -165,21 +166,6 @@ struct Args {
/// useful for debugging.
#[arg(long)]
current_thread_runtime: bool,
/// Dump control file at path specified by this argument and exit.
#[arg(long)]
dump_control_file: Option<Utf8PathBuf>,
/// Patch control file at path specified by this argument and exit.
/// Patch is specified in --patch option and imposed over
/// control file as per rfc7386.
/// Without --write-patched the result is only printed.
#[arg(long, verbatim_doc_comment)]
patch_control_file: Option<Utf8PathBuf>,
/// The patch to apply to control file at --patch-control-file, in JSON.
#[arg(long, default_value = None)]
patch: Option<String>,
/// Write --patch-control-file result back in place.
#[arg(long, default_value = "false")]
write_patched: bool,
}
// Like PathBufValueParser, but allows empty string.
@@ -221,13 +207,7 @@ async fn main() -> anyhow::Result<()> {
if let Some(addr) = args.dump_control_file {
let state = control_file::FileStorage::load_control_file(addr)?;
let json = serde_json::to_string(&state)?;
println!("{json}");
return Ok(());
}
if let Some(cfile_path) = args.patch_control_file {
let patch = args.patch.ok_or(anyhow::anyhow!("patch is missing"))?;
patch_control_file(cfile_path, patch, args.write_patched).await?;
print!("{json}");
return Ok(());
}
@@ -549,26 +529,6 @@ fn parse_remote_storage(storage_conf: &str) -> anyhow::Result<RemoteStorageConfi
})
}
/// Applies a JSON patch to the control file at `cfile_path` and prints the result.
///
/// The loaded state is serialized to JSON, `patch` is merged over it with `json_merge`,
/// and the merged document is deserialized back into a `TimelinePersistentState` to check
/// that it is still a valid state. The merged JSON is printed to stdout; it is written back
/// to `cfile_path` (with fsync) only when `write` is true.
///
/// # Errors
///
/// Fails if the control file cannot be loaded, if `patch` is not valid JSON, if the
/// merged document no longer deserializes into a valid state, or if persisting fails.
async fn patch_control_file(
    cfile_path: Utf8PathBuf,
    patch: String,
    write: bool,
) -> anyhow::Result<()> {
    let state = control_file::FileStorage::load_control_file(&cfile_path)?;
    // serialize to json, impose patch and deserialize back
    let mut state_json =
        serde_json::to_value(state).context("failed to serialize state to json")?;
    let patch_json = serde_json::from_str(&patch).context("failed to parse patch")?;
    json_merge(&mut state_json, patch_json);
    // The clone is needed: `state_json` is printed below, after validation has succeeded.
    let patched_state: TimelinePersistentState =
        serde_json::from_value(state_json.clone()).context("failed to deserialize patched json")?;
    println!("{state_json}");
    if write {
        FileStorage::do_persist(&patched_state, &cfile_path, true).await?;
    }
    Ok(())
}
#[test]
fn verify_cli() {
use clap::CommandFactory;

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8PathBuf;
use tokio::fs::{self, File};
use tokio::io::AsyncWriteExt;
@@ -155,46 +155,6 @@ impl FileStorage {
})?;
Ok(state)
}
/// Writes state `s` to the file at `dst_path`, fsyncing the file when `sync` is true.
///
/// The file contents are the magic number and format version (little-endian `u32`s),
/// the serialized state, and a trailing little-endian crc32c of everything before it.
pub async fn do_persist(
    s: &TimelinePersistentState,
    dst_path: &Utf8Path,
    sync: bool,
) -> Result<()> {
    // The file is created (and truncated) before anything is serialized.
    let mut file = File::create(dst_path)
        .await
        .with_context(|| format!("failed to create partial control file at: {}", dst_path))?;

    let mut image: Vec<u8> = Vec::new();
    WriteBytesExt::write_u32::<LittleEndian>(&mut image, SK_MAGIC)?;
    WriteBytesExt::write_u32::<LittleEndian>(&mut image, SK_FORMAT_VERSION)?;
    s.ser_into(&mut image)?;

    // The checksum covers header and state, so compute it before appending the trailer.
    let crc = crc32c::crc32c(&image).to_le_bytes();
    image.extend_from_slice(&crc);

    file.write_all(&image).await.with_context(|| {
        format!(
            "failed to write safekeeper state into control file at: {}",
            dst_path
        )
    })?;
    file.flush().await.with_context(|| {
        format!(
            "failed to flush safekeeper state into control file at: {}",
            dst_path
        )
    })?;

    // fsync only on request
    if sync {
        file.sync_all()
            .await
            .with_context(|| format!("failed to sync partial control file at {}", dst_path))?;
    }

    Ok(())
}
}
impl Deref for FileStorage {
@@ -207,7 +167,7 @@ impl Deref for FileStorage {
#[async_trait::async_trait]
impl Storage for FileStorage {
/// Atomically persists state durably to the underlying storage.
/// Persists state durably to the underlying storage.
///
/// For a description, see <https://lwn.net/Articles/457667/>.
async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
@@ -215,9 +175,46 @@ impl Storage for FileStorage {
// write data to safekeeper.control.partial
let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
FileStorage::do_persist(s, &control_partial_path, !self.conf.no_sync).await?;
let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
format!(
"failed to create partial control file at: {}",
&control_partial_path
)
})?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);
buf.extend_from_slice(&checksum.to_le_bytes());
control_partial.write_all(&buf).await.with_context(|| {
format!(
"failed to write safekeeper state into control file at: {}",
control_partial_path
)
})?;
control_partial.flush().await.with_context(|| {
format!(
"failed to flush safekeeper state into control file at: {}",
control_partial_path
)
})?;
// fsync the file
if !self.conf.no_sync {
control_partial.sync_all().await.with_context(|| {
format!(
"failed to sync partial control file at {}",
control_partial_path
)
})?;
}
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
// rename should be atomic
fs::rename(&control_partial_path, &control_path).await?;
// this sync is not required by any standard but postgres does this (see durable_rename)

View File

@@ -288,32 +288,34 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
}
/// Deactivates the timeline and removes its data directory.
async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn timeline_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "timeline_id")?,
);
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
let resp = GlobalTimelines::delete(&ttid, only_local)
let resp = GlobalTimelines::delete_force(&ttid)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
}
/// Deactivates all timelines for the tenant and removes its data directory.
/// See `timeline_delete_handler`.
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
/// See `timeline_delete_force_handler`.
async fn tenant_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id = parse_request_param(&request, "tenant_id")?;
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(
@@ -510,10 +512,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
request_span(r, timeline_status_handler)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_delete_handler)
request_span(r, timeline_delete_force_handler)
})
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_handler)
request_span(r, tenant_delete_force_handler)
})
.post("/v1/pull_timeline", |r| {
request_span(r, timeline_pull_handler)

View File

@@ -2,7 +2,6 @@
use camino::Utf8PathBuf;
use once_cell::sync::Lazy;
use remote_storage::RemoteStorageConfig;
use serde_json::Value;
use tokio::runtime::Runtime;
use std::time::Duration;
@@ -89,10 +88,6 @@ impl SafeKeeperConf {
self.tenant_dir(&ttid.tenant_id)
.join(ttid.timeline_id.to_string())
}
/// Returns true only when a remote storage is configured *and* the
/// `wal_backup_enabled` flag is set.
pub fn is_wal_backup_enabled(&self) -> bool {
    let has_remote_storage = self.remote_storage.is_some();
    has_remote_storage && self.wal_backup_enabled
}
}
impl SafeKeeperConf {
@@ -176,24 +171,3 @@ pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.build()
.expect("Failed to create broker runtime")
});
/// Merge JSON patch `b` into `a` according to RFC 7396 (JSON Merge Patch):
/// <https://www.rfc-editor.org/rfc/rfc7396>
/// (adapted from <https://stackoverflow.com/a/54118457/4014587>).
///
/// * If `b` is not an object (scalar, array or `null`), it replaces `a`
///   wholesale.
/// * If `b` is an object, `a` is first reset to an empty object unless it
///   already is one. Then each member of `b` is applied in turn: a `null`
///   value removes that key from `a`, any other value is merged recursively
///   into the corresponding member of `a`.
///
/// Resetting a non-object target before merging (rather than copying `b`
/// verbatim) is what guarantees that `null` members of the patch never end up
/// in the result, at any nesting depth.
pub fn json_merge(a: &mut Value, b: Value) {
    let patch = match b {
        Value::Object(patch) => patch,
        // Anything that is not an object replaces the target as a whole.
        other => {
            *a = other;
            return;
        }
    };

    // RFC 7396: a target that is not an object is treated as an empty object.
    if !a.is_object() {
        *a = Value::Object(Default::default());
    }

    if let Value::Object(target) = a {
        for (k, v) in patch {
            if v.is_null() {
                // `null` in a patch means "delete this member".
                target.remove(&k);
            } else {
                // Missing members start out as Null and are overwritten or
                // turned into an object by the recursive call.
                json_merge(target.entry(k).or_insert(Value::Null), v);
            }
        }
    }
}

View File

@@ -742,11 +742,6 @@ where
state.timeline_start_lsn
);
}
if state.peer_horizon_lsn == Lsn(0) {
// Update peer_horizon_lsn as soon as we know where timeline starts.
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
state.peer_horizon_lsn = msg.timeline_start_lsn;
}
if state.local_start_lsn == Lsn(0) {
state.local_start_lsn = msg.start_streaming_at;
info!("setting local_start_lsn to {:?}", state.local_start_lsn);

View File

@@ -407,7 +407,7 @@ impl SafekeeperPostgresHandler {
self.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,
self.conf.is_wal_backup_enabled(),
self.conf.wal_backup_enabled,
)?;
// Split to concurrently receive and send data; replies are generally

View File

@@ -33,13 +33,12 @@ use crate::safekeeper::{
};
use crate::send_wal::WalSenders;
use crate::state::{TimelineMemState, TimelinePersistentState};
use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::Storage as wal_storage_iface;
use crate::SafeKeeperConf;
use crate::{debug_dump, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -472,29 +471,14 @@ impl Timeline {
}
}
/// Delete timeline from disk completely, by removing timeline directory.
/// Background timeline activities will stop eventually.
///
/// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but
/// deletion API endpoint is retriable.
pub async fn delete(
/// Delete timeline from disk completely, by removing timeline directory. Background
/// timeline activities will stop eventually.
pub async fn delete_from_disk(
&self,
shared_state: &mut MutexGuard<'_, SharedState>,
only_local: bool,
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel(shared_state);
// TODO: It's better to wait for s3 offloader termination before
// removing data from s3. Though since s3 doesn't have transactions it
// still wouldn't guarantee absence of data after removal.
let conf = GlobalTimelines::get_global_config();
if !only_local && conf.is_wal_backup_enabled() {
// Note: we concurrently delete remote storage data from multiple
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
// do some retries anyway.
wal_backup::delete_timeline(&self.ttid).await?;
}
let dir_existed = delete_dir(&self.timeline_dir).await?;
Ok((dir_existed, was_active))
}

View File

@@ -327,20 +327,16 @@ impl GlobalTimelines {
}
/// Cancels timeline, then deletes the corresponding data directory.
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete(
ttid: &TenantTimelineId,
only_local: bool,
) -> Result<TimelineDeleteForceResult> {
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
match tli_res {
Ok(timeline) => {
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;
info!("deleting timeline {}, only_local={}", ttid, only_local);
info!("deleting timeline {}", ttid);
let (dir_existed, was_active) =
timeline.delete(&mut shared_state, only_local).await?;
timeline.delete_from_disk(&mut shared_state).await?;
// Remove timeline from the map.
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -373,11 +369,8 @@ impl GlobalTimelines {
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return error and the caller should
/// retry tenant deletion again later.
///
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete_force_all_for_tenant(
tenant_id: &TenantId,
only_local: bool,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
let to_delete = Self::get_all_for_tenant(*tenant_id);
@@ -386,7 +379,7 @@ impl GlobalTimelines {
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete(&tli.ttid, only_local).await {
match Self::delete_force(&tli.ttid).await {
Ok(result) => {
deleted.insert(tli.ttid, result);
}

View File

@@ -4,8 +4,6 @@ use camino::{Utf8Path, Utf8PathBuf};
use futures::stream::FuturesOrdered;
use futures::StreamExt;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::NodeId;
use std::cmp::min;
@@ -168,17 +166,6 @@ async fn update_task(
}
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
// Returns the remote storage held in `REMOTE_STORAGE`.
//
// Storage must be configured and initialized when this is called: the function
// panics if `REMOTE_STORAGE` has not been set yet, or if it holds `None`.
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
    let configured = REMOTE_STORAGE.get().expect("failed to get remote storage");
    configured.as_ref().unwrap()
}
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
@@ -212,7 +199,7 @@ pub async fn wal_backup_launcher_task_main(
ttid = wal_backup_launcher_rx.recv() => {
// channel is never expected to get closed
let ttid = ttid.unwrap();
if !conf.is_wal_backup_enabled() {
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
continue; /* just drain the channel and do nothing */
}
async {
@@ -497,12 +484,18 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
res
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
async fn backup_object(
source_file: &Utf8Path,
target_file: &RemotePath,
size: usize,
) -> Result<()> {
let storage = get_configured_remote_storage();
let storage = REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
.as_ref()
.unwrap();
let file = File::open(&source_file)
.await
@@ -539,39 +532,6 @@ pub async fn read_object(
Ok(Box::pin(reader))
}
/// Delete the WAL files kept in remote storage for the given timeline.
///
/// Remote storage must be configured when this is called. Files are looked up
/// under the `<tenant_id>/<timeline_id>` remote prefix; each attempt lists
/// them and then deletes everything that was listed.
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    let storage = get_configured_remote_storage();

    let tenant_dir = ttid.tenant_id.to_string();
    let timeline_prefix = Utf8Path::new(&tenant_dir).join(ttid.timeline_id.to_string());
    let remote_path = RemotePath::new(&timeline_prefix)?;

    // Why this goes through backoff::retry:
    // - it backs off instead of busy-polling the API on errors;
    // - it absorbs transient 429/503 conditions without hitting our error
    //   logging path for issues deleting objects.
    //
    // Note: listing segments might take a long time if there are many of them.
    // We don't currently have http requests timeout cancellation, but if/once
    // we have listing should get streaming interface to make progress.
    //
    // NOTE(review): `|_| false` presumably marks no error as permanent, and
    // 3 / 10 look like the warn threshold and the retry limit -- confirm
    // against the signature of `utils::backoff::retry`.
    let cancel = CancellationToken::new(); // not really used
    backoff::retry(
        || async {
            let files = storage.list_files(Some(&remote_path)).await?;
            storage.delete_objects(&files).await?;
            Ok(())
        },
        |_| false,
        3,
        10,
        "executing WAL segments deletion batch",
        backoff::Cancel::new(cancel, || anyhow::anyhow!("canceled")),
    )
    .await?;

    Ok(())
}
/// Copy segments from one timeline to another. Used in copy_timeline.
pub async fn copy_s3_segments(
wal_seg_size: usize,

View File

@@ -12,11 +12,9 @@ from pathlib import Path
# Type-related stuff
from typing import Callable, ClassVar, Dict, Iterator, Optional
import allure
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from _pytest.terminal import TerminalReporter
from fixtures.log_helper import log
@@ -413,10 +411,7 @@ class NeonBenchmarker:
@pytest.fixture(scope="function")
def zenbenchmark(
request: FixtureRequest,
record_property: Callable[[str, object], None],
) -> Iterator[NeonBenchmarker]:
def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]:
"""
This is a python decorator for benchmark fixtures. It contains functions for
recording measurements, and prints them out at the end.
@@ -424,21 +419,6 @@ def zenbenchmark(
benchmarker = NeonBenchmarker(record_property)
yield benchmarker
results = {}
for _, recorded_property in request.node.user_properties:
name = recorded_property["name"]
value = str(recorded_property["value"])
if (unit := recorded_property["unit"].strip()) != "":
value += f" {unit}"
results[name] = value
content = json.dumps(results, indent=2)
allure.attach(
content,
"benchmarks.json",
allure.attachment_type.JSON,
)
def pytest_addoption(parser: Parser):
parser.addoption(

View File

@@ -3352,15 +3352,9 @@ class SafekeeperHttpClient(requests.Session):
)
res.raise_for_status()
# only_local doesn't remove segments in the remote storage.
def timeline_delete(
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
) -> Dict[Any, Any]:
def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]:
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
params={
"only_local": str(only_local).lower(),
},
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
)
res.raise_for_status()
res_json = res.json()

View File

@@ -1,11 +1,11 @@
import time
from typing import Any, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
from fixtures.remote_storage import RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import wait_until
@@ -233,18 +233,23 @@ def timeline_delete_wait_completed(
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
# remote_storage must not be None, but that's easier for callers to make mypy happy
if TYPE_CHECKING:
# TODO avoid by combining remote storage related stuff in single type
# and just passing in this type instead of whole builder
from fixtures.neon_fixtures import NeonEnvBuilder
def assert_prefix_empty(
remote_storage: Optional[RemoteStorage],
neon_env_builder: "NeonEnvBuilder",
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
response = list_prefix(neon_env_builder, prefix)
keys = response["KeyCount"]
objects: List[ObjectTypeDef] = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
remote_storage = neon_env_builder.pageserver_remote_storage
is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
if is_mock_s3:
@@ -278,20 +283,19 @@ def assert_prefix_empty(
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
# remote_storage must not be None, but that's easier for callers to make mypy happy
def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
response = list_prefix(neon_env_builder, prefix)
assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
def list_prefix(
remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/"
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
) -> ListObjectsV2OutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
"""
# For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to s3 api.
remote = neon_env_builder.pageserver_remote_storage
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None

View File

@@ -1,6 +1,5 @@
import enum
import time
from collections import Counter
from dataclasses import dataclass
from typing import Any, Dict, Tuple
@@ -120,19 +119,6 @@ class EvictionEnv:
for tid, tlid in self.timelines
}
def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]:
ret: Counter[TenantId] = Counter()
for tenant_id, timeline_id in self.timelines:
timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists()
for file in timeline_dir.iterdir():
if "__" not in file.name:
continue
ret[tenant_id] += 1
return dict(ret)
def warm_up_tenant(self, tenant_id: TenantId):
"""
Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it.
@@ -517,7 +503,6 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
(total_on_disk, _, _) = env.timelines_du(env.pageserver)
du_by_timeline = env.du_by_timeline(env.pageserver)
tenant_layers = env.count_layers_per_tenant(env.pageserver)
# pick smaller or greater (iteration order is insertion order of scale=4 and scale=6)
[warm, cold] = list(du_by_timeline.keys())
@@ -571,31 +556,8 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
cold_size < cold_upper
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
else:
# with relative order what matters is the amount of layers, with a
# fudge factor of whether the eviction bothers tenants with highest
# layer count the most. last accessed times between tenants does not
# matter.
layers_now = env.count_layers_per_tenant(env.pageserver)
expected_ratio = later_total_on_disk / total_on_disk
log.info(
f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
)
for tenant_id, original_count in tenant_layers.items():
count_now = layers_now[tenant_id]
ratio = count_now / original_count
abs_diff = abs(ratio - expected_ratio)
assert original_count > count_now
log.info(
f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1"
)
# in this test case both relative_spare and relative_equal produce
# the same outcomes; this must be a quantization effect of similar
# sizes (-s4 and -s6) and small (5MB) layer size.
# for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
assert abs_diff < 0.05
# just go with the space was freed, find proper limits later
pass
def poor_mans_du(

View File

@@ -216,14 +216,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
log.info(f"group: {m.group(1)}")
return int(m.group(1), 16)
assert neon_env_builder.pageserver_remote_storage is not None
pre_upgrade_keys = list(
[
o["Key"]
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
"Contents"
]
]
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in pre_upgrade_keys:
assert parse_generation_suffix(key) is None
@@ -238,12 +232,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
legacy_objects: list[str] = []
suffixed_objects = []
post_upgrade_keys = list(
[
o["Key"]
for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[
"Contents"
]
]
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in post_upgrade_keys:
log.info(f"post-upgrade key: {key}")

View File

@@ -504,7 +504,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",

View File

@@ -75,7 +75,7 @@ def test_tenant_delete_smoke(
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -96,7 +96,7 @@ def test_tenant_delete_smoke(
assert not tenant_path.exists()
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -207,7 +207,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -268,7 +268,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# Check remote is empty
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -304,7 +304,7 @@ def test_tenant_delete_is_resumed_on_attach(
# sanity check, data should be there
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -343,7 +343,7 @@ def test_tenant_delete_is_resumed_on_attach(
)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -378,7 +378,7 @@ def test_tenant_delete_is_resumed_on_attach(
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -543,7 +543,7 @@ def test_tenant_delete_concurrent(
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -556,6 +556,216 @@ def test_tenant_delete_concurrent(
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
def test_tenant_delete_races_timeline_creation_01(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_02(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_03(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_04(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_05(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_06(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_07(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_08(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_09(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_10(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_11(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_12(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_13(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_14(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_15(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_16(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_17(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_18(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_19(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_20(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_21(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_22(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_23(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_24(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_25(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_26(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_27(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_28(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_29(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation_30(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
test_tenant_delete_races_timeline_creation(neon_env_builder, pg_bin)
def test_tenant_delete_races_timeline_creation(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
@@ -645,7 +855,7 @@ def test_tenant_delete_races_timeline_creation(
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",

View File

@@ -191,7 +191,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -275,7 +275,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
# Check remote is empty
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
assert all([tl["state"] == "Active" for tl in timelines])
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -466,7 +466,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -482,7 +482,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
wait_until(
2,
0.5,
lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage),
lambda: assert_prefix_empty(neon_env_builder),
)
@@ -673,7 +673,7 @@ def test_timeline_delete_works_for_remote_smoke(
for timeline_id in timeline_ids:
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -690,7 +690,7 @@ def test_timeline_delete_works_for_remote_smoke(
timeline_delete_wait_completed(ps_http, tenant_id=tenant_id, timeline_id=timeline_id)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -703,7 +703,7 @@ def test_timeline_delete_works_for_remote_smoke(
# for some reason the check above doesn't immediately take effect for the below.
# Assume it is mock server inconsistency and check twice.
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
def test_delete_orphaned_objects(
@@ -791,7 +791,7 @@ def test_timeline_delete_resumed_on_attach(
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -839,7 +839,7 @@ def test_timeline_delete_resumed_on_attach(
assert reason.endswith(f"failpoint: {failpoint}"), reason
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",
@@ -870,7 +870,7 @@ def test_timeline_delete_resumed_on_attach(
assert not tenant_path.exists()
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
neon_env_builder,
prefix="/".join(
(
"tenants",

View File

@@ -33,19 +33,13 @@ from fixtures.neon_fixtures import (
last_flush_lsn_upload,
)
from fixtures.pageserver.utils import (
assert_prefix_empty,
assert_prefix_not_empty,
timeline_delete_wait_completed,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import (
RemoteStorageKind,
default_remote_storage,
s3_storage,
)
from fixtures.remote_storage import RemoteStorageKind, default_remote_storage
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background
@@ -124,8 +118,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
with env.pageserver.http_client() as pageserver_http:
timeline_details = [
pageserver_http.timeline_detail(
tenant_id=tenant_id,
timeline_id=branch_names_to_timeline_ids[branch_name],
tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]
)
for branch_name in branch_names
]
@@ -464,19 +457,10 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId,
def test_wal_backup(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
env = neon_env_builder.init_start()
# These are expected after timeline deletion on safekeepers.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was not found in global map.*",
".*Timeline .* was cancelled and cannot be used anymore.*",
]
)
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup")
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
@@ -504,8 +488,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
# put one of safekeepers down again
env.safekeepers[0].stop()
# restart postgres
endpoint.stop()
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
endpoint.stop_and_destroy().create_start("test_safekeepers_wal_backup")
# and ensure offloading still works
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
@@ -515,17 +498,6 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
partial(is_segment_offloaded, env.safekeepers[1], tenant_id, timeline_id, seg_end),
f"segment ending at {seg_end} get offloaded",
)
env.safekeepers[0].start()
endpoint.stop()
# Test that after timeline deletion remote objects are gone.
prefix = "/".join([str(tenant_id), str(timeline_id)])
assert_prefix_not_empty(neon_env_builder.safekeepers_remote_storage, prefix)
for sk in env.safekeepers:
sk_http = sk.http_client()
sk_http.timeline_delete(tenant_id, timeline_id)
assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix)
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
@@ -614,7 +586,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
# advancing peer_horizon_lsn.
for sk in env.safekeepers:
cli = sk.http_client()
cli.timeline_delete(tenant_id, timeline_id, only_local=True)
cli.timeline_delete_force(tenant_id, timeline_id)
# restart safekeeper to clear its in-memory state
sk.stop()
# wait for all potential in-flight pushes to the broker to arrive before starting
@@ -1651,7 +1623,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
endpoint_3.stop_and_destroy()
# Remove initial tenant's br1 (active)
assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1659,7 +1631,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
# Ensure repeated deletion succeeds
assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1670,13 +1642,13 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
# Ensure we cannot delete the other tenant
for sk_h in [sk_http, sk_http_noauth]:
with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
assert sk_h.timeline_delete(tenant_id_other, timeline_id_other)
assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other)
with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
assert sk_h.tenant_delete_force(tenant_id_other)
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
# Remove initial tenant's br2 (inactive)
assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"]
assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
@@ -1684,7 +1656,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
# Remove non-existing branch, should succeed
assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"]
assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"]
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists()

View File

@@ -6,7 +6,7 @@ commands:
sysvInitAction: sysinit
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
- name: pgbouncer
user: postgres
user: nobody
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: postgres-exporter
@@ -36,9 +36,7 @@ files:
max_client_conn=10000
default_pool_size=64
max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
admin_users=cloud_admin
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -200,7 +198,7 @@ merge: |
RUN set -e \
&& chown postgres:postgres /etc/pgbouncer.ini \
&& chmod 0666 /etc/pgbouncer.ini \
&& chmod 0644 /etc/pgbouncer.ini \
&& chmod 0644 /etc/cgconfig.conf \
&& chmod 0644 /etc/sql_exporter.yml \
&& chmod 0644 /etc/neon_collector.yml