Compare commits

2 Commits

Author SHA1 Message Date
Vlad Lazar
750d65fa77 [WIP] Add logs for pg-regress 2025-04-25 16:38:08 +02:00
Em Sharnoff
2b0248cd76 fix(proxy): s/Console/Control plane/ in cplane error (#11716) 2025-04-25 11:09:56 +00:00

    I got bamboozled by the error message while debugging, seems no
    objections to updating it.

    ref https://neondb.slack.com/archives/C060N3SEF9D/p1745570961111509

    ref https://neondb.slack.com/archives/C039YKBRZB4/p1745570811957019?thread_ts=1745393368.283599

6 changed files with 146 additions and 78 deletions

View File

@@ -21,7 +21,7 @@ use pageserver_api::config::{
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
};
-use pageserver_api::key::rel_block_to_key;
+use pageserver_api::key::{Key, rel_block_to_key};
use pageserver_api::models::{
self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -29,7 +29,7 @@ use pageserver_api::models::{
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
PagestreamProtocolVersion, PagestreamRequest, TenantState,
};
-use pageserver_api::reltag::SlruKind;
+use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::TenantShardId;
use postgres_backend::{
AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
@@ -1035,10 +1035,10 @@ impl PageServerHandler {
// avoid a somewhat costly Span::record() by constructing the entire span in one go.
macro_rules! mkspan {
(before shard routing) => {{
-tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
+tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since)
}};
($shard_id:expr) => {{
-tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
+tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, shard_id = %$shard_id)
}};
}
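
Note on the Span::record() comment above: a minimal, self-contained sketch (plain tracing crate usage; the span and field names here are illustrative, not the handler's real ones) of the difference between declaring an empty field and recording it later versus constructing the span with every field known up front, which is what the mkspan! macro does:

    use tracing::{Span, field, info_span};

    fn span_construction_example(parent: &Span, blkno: u32) {
        // Two-step: declare the field as empty, then fill it in afterwards.
        // Every record() call is an extra trip through the subscriber.
        let span = info_span!(parent: parent, "handle_get_page_request", blkno = field::Empty);
        span.record("blkno", blkno);

        // One-shot: all field values are known at construction time, so the
        // span is built in a single call -- the approach the macro takes.
        let _span = info_span!(parent: parent, "handle_get_page_request", blkno = blkno);
    }
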
@@ -1140,9 +1140,10 @@ impl PageServerHandler {
.await?;
// We're holding the Handle
+let last_record_lsn = shard.get_last_record_lsn();
let effective_request_lsn = match Self::effective_request_lsn(
&shard,
-shard.get_last_record_lsn(),
+last_record_lsn,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
@@ -1153,6 +1154,22 @@ impl PageServerHandler {
}
};
+let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF").unwrap();
+let trouble_rel = RelTag {
+spcnode: trouble_key.field2,
+dbnode: trouble_key.field3,
+relnode: trouble_key.field4,
+forknum: trouble_key.field5,
+};
+if req.rel == trouble_rel {
+tracing::info!(
+request_lsn=%req.hdr.request_lsn,
+not_modified_since_lsn=%req.hdr.not_modified_since,
+%last_record_lsn,
+"effective_request_lsn for {} is {}", key, effective_request_lsn
+);
+}
BatchedFeMessage::GetPage {
span,
shard: shard.downgrade(),
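
The WIP additions in this file (and in the datadir modification code below) all repeat the same filtering pattern: parse the hard-coded key of the page being debugged, derive the relation it belongs to from key fields 2-5, and only log when the request's RelTag matches. A rough sketch of that pattern as a pair of helpers, assuming the pageserver_api Key and RelTag types exactly as they appear in the hunks above (the helper names are made up here; the commit inlines this check at every call site):

    use pageserver_api::key::Key;
    use pageserver_api::reltag::RelTag;

    /// Relation identity of a key, using the field2..field5 mapping from the diff.
    fn rel_of(key: &Key) -> RelTag {
        RelTag {
            spcnode: key.field2,
            dbnode: key.field3,
            relnode: key.field4,
            forknum: key.field5,
        }
    }

    /// True if `rel` is the relation that the hard-coded "trouble" key belongs to.
    fn is_trouble_rel(rel: &RelTag) -> bool {
        let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF").unwrap();
        *rel == rel_of(&trouble_key)
    }
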

View File

@@ -185,6 +185,7 @@ impl Timeline {
pending_directory_entries: Vec::new(),
pending_metadata_bytes: 0,
lsn,
+extra_log: false,
}
}
@@ -265,6 +266,14 @@ impl Timeline {
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
HashMap::with_capacity(pages.len());
+let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF").unwrap();
+let trouble_rel = RelTag {
+spcnode: trouble_key.field2,
+dbnode: trouble_key.field3,
+relnode: trouble_key.field4,
+forknum: trouble_key.field5,
+};
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
if tag.relnode == 0 {
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
@@ -275,6 +284,14 @@ impl Timeline {
continue;
}
+if *tag == trouble_rel {
+tracing::info!(
+"Getting rel size for {} at LSN {}",
+rel_block_to_key(*tag, *blknum),
+lsn
+);
+}
let nblocks = match self
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
@@ -1084,17 +1101,8 @@ impl Timeline {
let mut result = HashMap::new();
for (k, v) in kv {
let v = v?;
-if v.is_empty() {
-// This is a tombstone -- we can skip it.
-// Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it part of
-// the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone,
-// we also need to consider that. Such tombstones might be written on the detach ancestor code path to
-// avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.)
-continue;
-}
let origin_id = k.field6 as RepOriginId;
-let origin_lsn = Lsn::des(&v)
-.with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?;
+let origin_lsn = Lsn::des(&v).unwrap();
if origin_lsn != Lsn::INVALID {
result.insert(origin_id, origin_lsn);
}
@@ -1411,6 +1419,8 @@ pub struct DatadirModification<'a> {
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
pending_metadata_bytes: usize,
+extra_log: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -1629,6 +1639,32 @@ impl DatadirModification<'_> {
) -> Result<(), WalIngestError> {
let mut gaps_at_lsns = Vec::default();
+let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF").unwrap();
+let trouble_rel = RelTag {
+spcnode: trouble_key.field2,
+dbnode: trouble_key.field3,
+relnode: trouble_key.field4,
+forknum: trouble_key.field5,
+};
+for meta in batch.metadata.iter().filter_map(|m| match m {
+ValueMeta::Serialized(serialized_value_meta) => Some(serialized_value_meta),
+ValueMeta::Observed(_) => None,
+}) {
+let key = Key::from_compact(meta.key);
+let rel = RelTag {
+spcnode: key.field2,
+dbnode: key.field3,
+relnode: key.field4,
+forknum: key.field5,
+};
+if rel == trouble_rel {
+tracing::info!("Put for {key} at LSN {}", meta.lsn);
+self.extra_log = true;
+}
+}
for meta in batch.metadata.iter() {
let key = Key::from_compact(meta.key());
let (rel, blkno) = key
@@ -1959,6 +1995,19 @@ impl DatadirModification<'_> {
"invalid relnode"
)))?;
}
+let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF").unwrap();
+let trouble_rel = RelTag {
+spcnode: trouble_key.field2,
+dbnode: trouble_key.field3,
+relnode: trouble_key.field4,
+forknum: trouble_key.field5,
+};
+if rel == trouble_rel {
+self.extra_log = true;
+}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
@@ -1978,6 +2027,10 @@ impl DatadirModification<'_> {
true
};
+if rel == trouble_rel {
+tracing::info!(%dbdir_exists, "Maybe created db dir for {} at LSN {}", trouble_key.to_compact(), self.lsn);
+}
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if !dbdir_exists {
// Create the RelDirectory
@@ -2023,6 +2076,10 @@ impl DatadirModification<'_> {
}
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
+if rel == trouble_rel {
+tracing::info!(%dbdir_exists, "Created v2 rel for {} at LSN {}", trouble_key.to_compact(), self.lsn);
+}
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
@@ -2038,6 +2095,10 @@ impl DatadirModification<'_> {
rel_dir_key,
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
);
+if rel == trouble_rel {
+tracing::info!(%dbdir_exists, "Created v1 rel for {} at LSN {}", trouble_key.to_compact(), self.lsn);
+}
}
// Put size
@@ -2472,11 +2533,19 @@ impl DatadirModification<'_> {
};
if let Some(batch) = maybe_batch {
-tracing::debug!(
-"Flushing batch with max_lsn={}. Last record LSN is {}",
-batch.max_lsn,
-self.tline.get_last_record_lsn()
-);
+if self.extra_log {
+tracing::info!(
+"Flushing batch with max_lsn={}. Last record LSN is {}",
+batch.max_lsn,
+self.tline.get_last_record_lsn()
+);
+} else {
+tracing::debug!(
+"Flushing batch with max_lsn={}. Last record LSN is {}",
+batch.max_lsn,
+self.tline.get_last_record_lsn()
+);
+}
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
@@ -2510,6 +2579,14 @@ impl DatadirModification<'_> {
self.pending_metadata_bytes = 0;
+if self.extra_log {
+tracing::info!(
+"Flushed batch. Last record LSN is {}",
+self.tline.get_last_record_lsn()
+);
+self.extra_log = false;
+}
Ok(())
}
@@ -2587,11 +2664,6 @@ impl DatadirModification<'_> {
}
}
-#[cfg(test)]
-pub fn put_for_unit_test(&mut self, key: Key, val: Value) {
-self.put(key, val);
-}
fn put(&mut self, key: Key, val: Value) {
if Self::is_data_key(&key) {
self.put_data(key.to_compact(), val)
@@ -2605,6 +2677,26 @@ impl DatadirModification<'_> {
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key, val, self.lsn);
+let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF").unwrap();
+let trouble_rel = RelTag {
+spcnode: trouble_key.field2,
+dbnode: trouble_key.field3,
+relnode: trouble_key.field4,
+forknum: trouble_key.field5,
+};
+let key = Key::from_compact(key);
+let rel = RelTag {
+spcnode: key.field2,
+dbnode: key.field3,
+relnode: key.field4,
+forknum: key.field5,
+};
+if rel == trouble_rel {
+tracing::info!("Put for {key} at LSN {}", self.lsn);
+self.extra_log = true;
+}
}
fn put_metadata(&mut self, key: CompactKey, val: Value) {
@@ -2631,6 +2723,14 @@ impl DatadirModification<'_> {
if key == CHECKPOINT_KEY.to_compact() {
tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}");
}
+let trouble_key = Key::from_hex("000000067F000040000000400600FFFFFFFF")
+.unwrap()
+.to_compact();
+if key == trouble_key {
+tracing::info!("Put for {trouble_key} at LSN {}", self.lsn);
+self.extra_log = true;
+}
}
fn delete(&mut self, key_range: Range<Key>) {

View File

@@ -5931,9 +5931,7 @@ mod tests {
use itertools::Itertools;
#[cfg(feature = "testing")]
use models::CompactLsnRange;
-use pageserver_api::key::{
-AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key,
-};
+use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
use pageserver_api::keyspace::KeySpace;
#[cfg(feature = "testing")]
use pageserver_api::keyspace::KeySpaceRandomAccum;
@@ -8169,54 +8167,6 @@ mod tests {
assert_eq!(files.get("pg_logical/mappings/test2"), None);
}
-#[tokio::test]
-async fn test_repl_origin_tombstones() {
-let harness = TenantHarness::create("test_repl_origin_tombstones")
-.await
-.unwrap();
-let (tenant, ctx) = harness.load().await;
-let io_concurrency = IoConcurrency::spawn_for_test();
-let mut lsn = Lsn(0x08);
-let tline: Arc<Timeline> = tenant
-.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
-.await
-.unwrap();
-let repl_lsn = Lsn(0x10);
-{
-lsn += 8;
-let mut modification = tline.begin_modification(lsn);
-modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new()));
-modification.set_replorigin(1, repl_lsn).await.unwrap();
-modification.commit(&ctx).await.unwrap();
-}
-// we can read everything from the storage
-let repl_origins = tline
-.get_replorigins(lsn, &ctx, io_concurrency.clone())
-.await
-.unwrap();
-assert_eq!(repl_origins.len(), 1);
-assert_eq!(repl_origins[&1], lsn);
-{
-lsn += 8;
-let mut modification = tline.begin_modification(lsn);
-modification.put_for_unit_test(
-repl_origin_key(3),
-Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")),
-);
-modification.commit(&ctx).await.unwrap();
-}
-let result = tline
-.get_replorigins(lsn, &ctx, io_concurrency.clone())
-.await;
-assert!(result.is_err());
-}
#[tokio::test]
async fn test_metadata_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_image_creation").await?;

View File

@@ -178,7 +178,7 @@ impl Attempt {
}
}
-pub(crate) async fn generate_tombstone_image_layer(
+async fn generate_tombstone_image_layer(
detached: &Arc<Timeline>,
ancestor: &Arc<Timeline>,
ancestor_lsn: Lsn,

View File

@@ -8,7 +8,7 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::proxy::retry::CouldRetry;
/// A go-to error message which doesn't leak any detail.
-pub(crate) const REQUEST_FAILED: &str = "Console request failed";
+pub(crate) const REQUEST_FAILED: &str = "Control plane request failed";
/// Common console API error.
#[derive(Debug, Error)]

View File

@@ -323,6 +323,7 @@ class NeonProject:
if self.restart_pgbench_on_console_errors and (
"ERROR: Couldn't connect to compute node" in err
or "ERROR: Console request failed" in err
or "ERROR: Control plane request failed" in err
):
log.info("Restarting benchmark for %s", target)
self.benchmarks.pop(target)