pageserver: verify gRPC GetPages on correct shard (#12722)

Verify that gRPC `GetPageRequest` has been sent to the shard that owns the pages. This avoid spurious `NotFound` errors if a compute misroutes a request, which can appear scarier (e.g. data loss). Touches [LKB-191](https://databricks.atlassian.net/browse/LKB-191).
2026-06-04 14:00:38 +00:00 · 2025-07-25 15:43:04 +02:00
parent 37e322438b
commit 185ead8395
1 changed files with 13 additions and 2 deletions
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3429,8 +3429,6 @@ impl GrpcPageServiceHandler {
    /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
    /// GetPageResponse with an appropriate status code to avoid terminating the stream.
    ///
-    /// TODO: verify that the requested pages belong to this shard.
-    ///
    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
    /// split them up in the client or server.
@@ -3456,6 +3454,19 @@ impl GrpcPageServiceHandler {
            lsn = %req.read_lsn,
        );

+        for &blkno in &req.block_numbers {
+            let shard = timeline.get_shard_identity();
+            let key = rel_block_to_key(req.rel, blkno);
+            if !shard.is_key_local(&key) {
+                return Err(tonic::Status::invalid_argument(format!(
+                    "block {blkno} of relation {} requested on wrong shard {} (is on {})",
+                    req.rel,
+                    timeline.get_shard_index(),
+                    ShardIndex::new(shard.get_shard_number(&key), shard.count),
+                )));
+            }
+        }
+
        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
        let effective_lsn = PageServerHandler::effective_request_lsn(
            &timeline,