From cdb6479c8abd87df7c0c535ced25aeef5991a983 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 19 May 2025 11:03:06 +0200 Subject: [PATCH] pageserver: add gRPC page service schema (#11815) ## Problem For the [communicator project](https://github.com/neondatabase/company_projects/issues/352), we want to move to gRPC for the page service protocol. Touches #11728. ## Summary of changes This patch adds an experimental gRPC Protobuf schema for the page service. It is equivalent to the current page service, but with several improvements, e.g.: * Connection multiplexing. * Reduced head-of-line blocking. * Client-side batching. * Explicit tenant shard routing. * GetPage request classification (normal vs. prefetch). * Explicit rate limiting ("slow down" response status). The API is exposed as a new `pageserver/page_api` package. This is separate from the `pageserver_api` package to reduce the dependency footprint for the communicator. The longer-term plan is to also split out e.g. the WAL ingestion service to a separate gRPC package, e.g. `pageserver/wal_api`. Subsequent PRs will: add Rust domain types for the Protobuf types, expose a gRPC server, and implement the page service. Preliminary prototype benchmarks of this gRPC API is within 10% of baseline libpq performance. We'll do further benchmarking and optimization as the implementation lands in `main` and is deployed to staging. --- Cargo.lock | 10 + Cargo.toml | 2 + pageserver/page_api/Cargo.toml | 13 ++ pageserver/page_api/build.rs | 7 + pageserver/page_api/proto/page_service.proto | 220 +++++++++++++++++++ pageserver/page_api/src/lib.rs | 14 ++ 6 files changed, 266 insertions(+) create mode 100644 pageserver/page_api/Cargo.toml create mode 100644 pageserver/page_api/build.rs create mode 100644 pageserver/page_api/proto/page_service.proto create mode 100644 pageserver/page_api/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8ca65b58ce..d919537818 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4434,6 +4434,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_page_api" +version = "0.1.0" +dependencies = [ + "prost 0.13.3", + "tonic", + "tonic-build", + "workspace_hack", +] + [[package]] name = "papaya" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 74b281f88f..a280c446b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "pageserver/ctl", "pageserver/client", "pageserver/pagebench", + "pageserver/page_api", "proxy", "safekeeper", "safekeeper/client", @@ -252,6 +253,7 @@ pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } +pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml new file mode 100644 index 0000000000..c237949226 --- /dev/null +++ b/pageserver/page_api/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "pageserver_page_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +prost.workspace = true +tonic.workspace = true +workspace_hack.workspace = true + +[build-dependencies] +tonic-build.workspace = true diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs new file mode 100644 index 0000000000..ce3c49ed82 --- /dev/null +++ b/pageserver/page_api/build.rs @@ -0,0 +1,7 @@ +fn main() -> Result<(), Box> { + // Generates Rust code from .proto Protobuf schemas. + tonic_build::configure() + .bytes(["."]) + .compile_protos(&["proto/page_service.proto"], &["proto"]) + .map_err(|err| err.into()) +} diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto new file mode 100644 index 0000000000..12e4d2f9db --- /dev/null +++ b/pageserver/page_api/proto/page_service.proto @@ -0,0 +1,220 @@ +// Page service, presented by pageservers for computes. +// +// This is the compute read path. It primarily serves page versions at given +// LSNs, but also base backups, SLRU segments, and relation metadata. +// +// EXPERIMENTAL: this is still under development and subject to change. +// +// Request metadata headers: +// - authorization: JWT token ("Bearer "), if auth is enabled +// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980") +// - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) +// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") +// +// TODO: consider adding neon-compute-mode ("primary", "static", "replica"). +// However, this will require reconnecting when changing modes. +// +// TODO: write implementation guidance on +// - Health checks +// - Tracing, OpenTelemetry +// - Compression + +syntax = "proto3"; +package page_service; + +service PageService { + // Returns whether a relation exists. + rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); + + // Fetches a base backup. + rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); + + // Returns the total size of a database, as # of bytes. + rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse); + + // Fetches pages. + // + // This is implemented as a bidirectional streaming RPC for performance. Unary + // requests incur costs for e.g. HTTP/2 stream setup, header parsing, + // authentication, and so on -- with streaming, we only pay these costs during + // the initial stream setup. This ~doubles throughput in benchmarks. Other + // RPCs use regular unary requests, since they are not as frequent and + // performance-critical, and this simplifies implementation. + // + // NB: a status response (e.g. errors) will terminate the stream. The stream + // may be shared by e.g. multiple Postgres backends, so we should avoid this. + // Most errors are therefore sent as GetPageResponse.status instead. + rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse); + + // Returns the size of a relation, as # of blocks. + rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse); + + // Fetches an SLRU segment. + rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); +} + +// The LSN a request should read at. +message ReadLsn { + // The request's read LSN. Required. + uint64 request_lsn = 1; + // If given, the caller guarantees that the page has not been modified since + // this LSN. Must be smaller than or equal to request_lsn. This allows the + // Pageserver to serve an old page without waiting for the request LSN to + // arrive. Valid for all request types. + // + // It is undefined behaviour to make a request such that the page was, in + // fact, modified between request_lsn and not_modified_since_lsn. The + // Pageserver might detect it and return an error, or it might return the old + // page version or the new page version. Setting not_modified_since_lsn equal + // to request_lsn is always safe, but can lead to unnecessary waiting. + uint64 not_modified_since_lsn = 2; +} + +// A relation identifier. +message RelTag { + uint32 spc_oid = 1; + uint32 db_oid = 2; + uint32 rel_number = 3; + uint32 fork_number = 4; +} + +// Checks whether a relation exists, at the given LSN. Only valid on shard 0, +// other shards will error. +message CheckRelExistsRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message CheckRelExistsResponse { + bool exists = 1; +} + +// Requests a base backup at a given LSN. +message GetBaseBackupRequest { + // The LSN to fetch a base backup at. + ReadLsn read_lsn = 1; + // If true, logical replication slots will not be created. + bool replica = 2; +} + +// Base backup response chunk, returned as an ordered stream. +message GetBaseBackupResponseChunk { + // A basebackup data chunk. The size is undefined, but bounded by the 4 MB + // gRPC message size limit. + bytes chunk = 1; +} + +// Requests the size of a database, as # of bytes. Only valid on shard 0, other +// shards will error. +message GetDbSizeRequest { + ReadLsn read_lsn = 1; + uint32 db_oid = 2; +} + +message GetDbSizeResponse { + uint64 num_bytes = 1; +} + +// Requests one or more pages. +message GetPageRequest { + // A request ID. Will be included in the response. Should be unique for + // in-flight requests on the stream. + uint64 request_id = 1; + // The request class. + GetPageClass request_class = 2; + // The LSN to read at. + ReadLsn read_lsn = 3; + // The relation to read from. + RelTag rel = 4; + // Page numbers to read. Must belong to the remote shard. + // + // Multiple pages will be executed as a single batch by the Pageserver, + // amortizing layer access costs and parallelizing them. This may increase the + // latency of any individual request, but improves the overall latency and + // throughput of the batch as a whole. + // + // TODO: this causes an allocation in the common single-block case. The sender + // can use a SmallVec to stack-allocate it, but Prost will always deserialize + // into a heap-allocated Vec. Consider optimizing this. + // + // TODO: we might be able to avoid a sort or something if we mandate that these + // are always in order. But we can't currenly rely on this on the server, because + // of compatibility with the libpq protocol handler. + repeated uint32 block_number = 5; +} + +// A GetPageRequest class. Primarily intended for observability, but may also be +// used for prioritization in the future. +enum GetPageClass { + // Unknown class. For forwards compatibility: used when the client sends a + // class that the server doesn't know about. + GET_PAGE_CLASS_UNKNOWN = 0; + // A normal request. This is the default. + GET_PAGE_CLASS_NORMAL = 1; + // A prefetch request. NB: can only be classified on pg < 18. + GET_PAGE_CLASS_PREFETCH = 2; + // A background request (e.g. vacuum). + GET_PAGE_CLASS_BACKGROUND = 3; +} + +// A GetPage response. +// +// A batch response will contain all of the requested pages. We could eagerly +// emit individual pages as soon as they are ready, but on a readv() Postgres +// holds buffer pool locks on all pages in the batch and we'll only return once +// the entire batch is ready, so no one can make use of the individual pages. +message GetPageResponse { + // The original request's ID. + uint64 request_id = 1; + // The response status code. + GetPageStatus status = 2; + // A string describing the status, if any. + string reason = 3; + // The 8KB page images, in the same order as the request. Empty if status != OK. + repeated bytes page_image = 4; +} + +// A GetPageResponse status code. Since we use a bidirectional stream, we don't +// want to send errors as gRPC statuses, since this would terminate the stream. +enum GetPageStatus { + // Unknown status. For forwards compatibility: used when the server sends a + // status code that the client doesn't know about. + GET_PAGE_STATUS_UNKNOWN = 0; + // The request was successful. + GET_PAGE_STATUS_OK = 1; + // The page did not exist. The tenant/timeline/shard has already been + // validated during stream setup. + GET_PAGE_STATUS_NOT_FOUND = 2; + // The request was invalid. + GET_PAGE_STATUS_INVALID = 3; + // The tenant is rate limited. Slow down and retry later. + GET_PAGE_STATUS_SLOW_DOWN = 4; + // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a + // layer download. This could free up the server task to process other + // requests while the layer download is in progress. +} + +// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on +// shard 0, other shards will error. +message GetRelSizeRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message GetRelSizeResponse { + uint32 num_blocks = 1; +} + +// Requests an SLRU segment. Only valid on shard 0, other shards will error. +message GetSlruSegmentRequest { + ReadLsn read_lsn = 1; + uint32 kind = 2; + uint32 segno = 3; +} + +// Returns an SLRU segment. +// +// These are up 32 pages (256 KB), so we can send them as a single response. +message GetSlruSegmentResponse { + bytes segment = 1; +} diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs new file mode 100644 index 0000000000..0226d594cb --- /dev/null +++ b/pageserver/page_api/src/lib.rs @@ -0,0 +1,14 @@ +//! This crate provides the Pageserver's page API. It contains: +//! +//! * proto/page_service.proto: the Protobuf schema for the page API. +//! * proto: auto-generated Protobuf types for gRPC. +//! +//! This crate is used by both the client and the server. Try to keep it slim. + +// Code generated by protobuf. +pub mod proto { + tonic::include_proto!("page_service"); + + pub use page_service_client::PageServiceClient; + pub use page_service_server::{PageService, PageServiceServer}; +}