diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..c293ad705b 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -8,12 +8,89 @@ use hex::FromHex; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide a summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. +/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant. +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. 
+/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. 
+/// +/// These are written as `<tenant_id>-<shard_slug>`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); @@ -38,6 +115,7 @@ impl ShardCount { self.0 } + /// Whether this is the special zero value representing an unsharded tenant. pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +131,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. 
-/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +162,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +204,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +273,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. 
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +287,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +357,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +436,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -439,6 +475,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. 
pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +526,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +538,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +566,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f5540e896f..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 6740c1360b..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 47d8ae1148..0b8c991f11 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -696,7 +696,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -747,7 +747,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -1086,7 +1086,7 @@ async fn tenant_size_handler( let headers = request.headers(); let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3160f204e2..6755c15c30 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2089,7 +2089,7 @@ impl 
TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 17ff033e00..2eac1247f7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3190,7 +3190,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3437,7 +3437,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..0227331953 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d046a60af4..46b3d41e2b 100644 --- 
a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1344,7 +1344,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } @@ -2237,7 +2237,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2533,7 +2533,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. - debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 522c5b57de..304d0d60ee 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -378,7 +378,7 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f3419e886..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9c7e8748d5..4f83b118ae 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -403,7 +403,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. 
self.observe_decoded_block(modification, blk, ctx).await?; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 010558b797..4ee189dac9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2744,7 +2744,7 @@ impl Service { let mut describe_shards = Vec::new(); for shard in shards { - if shard.tenant_shard_id.is_zero() { + if shard.tenant_shard_id.is_shard_zero() { shard_zero = Some(shard); } @@ -4084,7 +4084,7 @@ impl Service { let mut reconciles_spawned = 0; for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { schedule_context = ScheduleContext::default(); } @@ -4134,7 +4134,7 @@ impl Service { let mut work = Vec::new(); for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); tenant_shards.clear();