mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 20:50:37 +00:00
pageserver: verify gRPC GetPages on correct shard (#12722)
Verify that gRPC `GetPageRequest` has been sent to the shard that owns the pages. This avoid spurious `NotFound` errors if a compute misroutes a request, which can appear scarier (e.g. data loss). Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191).
This commit is contained in:
@@ -3429,8 +3429,6 @@ impl GrpcPageServiceHandler {
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: verify that the requested pages belong to this shard.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
@@ -3456,6 +3454,19 @@ impl GrpcPageServiceHandler {
|
||||
lsn = %req.read_lsn,
|
||||
);
|
||||
|
||||
for &blkno in &req.block_numbers {
|
||||
let shard = timeline.get_shard_identity();
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
if !shard.is_key_local(&key) {
|
||||
return Err(tonic::Status::invalid_argument(format!(
|
||||
"block {blkno} of relation {} requested on wrong shard {} (is on {})",
|
||||
req.rel,
|
||||
timeline.get_shard_index(),
|
||||
ShardIndex::new(shard.get_shard_number(&key), shard.count),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
|
||||
let effective_lsn = PageServerHandler::effective_request_lsn(
|
||||
&timeline,
|
||||
|
||||
Reference in New Issue
Block a user