diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 16651c322e..80183506d8 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -124,6 +124,9 @@ impl KeySpaceAccum {
                 if range.start == accum.end {
                     accum.end = range.end;
                 } else {
+                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
+                    // a new range here if the skipped region was all keys that don't belong on this shard.
+                    // (https://github.com/neondatabase/neon/issues/6247)
                     assert!(range.start > accum.end);
                     self.ranges.push(accum.clone());
                     *accum = range;
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 3e4936eec4..a186d93bce 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -422,6 +422,21 @@ impl ShardIdentity {
         }
     }
 
+    /// Return true if the key should be discarded if found in this shard's
+    /// data store, e.g. during compaction after a split
+    pub fn is_key_disposable(&self, key: &Key) -> bool {
+        if key_is_shard0(key) {
+            // Q: Why can't we dispose of shard0 content if we're not shard 0?
+            // A: because the WAL ingestion logic currently ingests some shard 0
+            //    content on all shards, even though it's only read on shard 0. If we
+            //    dropped it, then subsequent WAL ingest to these keys would encounter
+            //    an error.
+            false
+        } else {
+            !self.is_key_local(key)
+        }
+    }
+
     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 15a5ca1727..e8340a74b2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -496,6 +496,11 @@ impl Timeline {
             return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
         }
 
+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
         // XXX: structured stats collection for layer eviction here.
         trace!(
             "get page request for {}@{} from task kind {:?}",
@@ -2224,13 +2229,13 @@ impl Timeline {
                         return Err(layer_traversal_error(
                             if cfg!(test) {
                                 format!(
-                                    "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
-                                    key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                    "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
+                                    key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                                 )
                             } else {
                                 format!(
-                                    "could not find data for key {} at LSN {}, for request at LSN {}",
-                                    key, cont_lsn, request_lsn
+                                    "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                                    key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
                                 )
                             },
                             traversal_path,
@@ -3054,6 +3059,15 @@ impl Timeline {
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
+                        if self.shard_identity.is_key_disposable(&key) {
+                            debug!(
+                                "Dropping key {} during compaction (it belongs on shard {:?})",
+                                key,
+                                self.shard_identity.get_shard_number(&key)
+                            );
+                            key = key.next();
+                            continue;
+                        }
                         let img = match self.get(key, lsn, ctx).await {
                             Ok(img) => img,
                             Err(err) => {
@@ -3080,6 +3094,7 @@ impl Timeline {
                                 }
                             }
                         };
+
                         image_layer_writer.put_image(key, &img).await?;
                         key = key.next();
                     }
@@ -3650,7 +3665,15 @@ impl Timeline {
                 )))
             });
 
-            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            if !self.shard_identity.is_key_disposable(&key) {
+                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }
 
             if !new_layers.is_empty() {
                 fail_point!("after-timeline-compacted-first-L1");