diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 99f0d374c1..db9715dc62 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -41,7 +41,7 @@ use pageserver_api::controller_api::{ use pageserver_api::models::{ ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, }; -use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; +use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::membership::SafekeeperGeneration; @@ -1117,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any stripe_size: args .shard_stripe_size .map(ShardStripeSize) - .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), + .unwrap_or(DEFAULT_STRIPE_SIZE), }, placement_policy: args.placement_policy.clone(), config: tenant_conf, @@ -1430,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res vec![(parsed.0, parsed.1.unwrap_or(5432))], // If caller is telling us what pageserver to use, this is not a tenant which is // full managed by storage controller, therefore not sharded. 
- ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) } else { // Look up the currently attached location of the tenant, and its striping metadata, diff --git a/docs/storage_controller.md b/docs/storage_controller.md index ac4aca4219..d761210033 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -151,7 +151,7 @@ Example body: ``` { "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", - "stripe_size": 32768, + "stripe_size": 2048, "shards": [ {"node_id": 344, "shard_number": 0}, {"node_id": 722, "shard_number": 1}, diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index e505f23e49..79e3ef553b 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -613,8 +613,7 @@ mod tests { use rand::{RngCore, SeedableRng}; use super::*; - use crate::models::ShardParameters; - use crate::shard::{ShardCount, ShardNumber}; + use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize}; // Helper function to create a key range. 
// @@ -964,12 +963,8 @@ mod tests { } #[test] fn sharded_range_relation_gap() { - let shard_identity = ShardIdentity::new( - ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -985,12 +980,8 @@ mod tests { #[test] fn shard_identity_keyspaces_single_key() { - let shard_identity = ShardIdentity::new( - ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -1034,12 +1025,8 @@ mod tests { #[test] fn shard_identity_keyspaces_forkno_gap() { - let shard_identity = ShardIdentity::new( - ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -1061,7 +1048,7 @@ mod tests { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) .unwrap(); @@ -1144,37 +1131,44 @@ mod tests { /// for a single tenant. 
#[test] fn sharded_range_fragment_simple() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); // A range which we happen to know covers exactly one stripe which belongs to this shard let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + let mut input_end = input_start; + input_end.field6 += STRIPE_SIZE; // field6 is block number // Ask for stripe_size blocks, we get the whole stripe assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 32768), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); // Ask for more, we still get the whole stripe assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 10000000), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); // Ask for target_nblocks of half the stripe size, we get two halves assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 16384), + do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2), ( - 32768, + STRIPE_SIZE, vec![ - (16384, input_start..input_start.add(16384)), - (16384, input_start.add(16384)..input_end) + ( + STRIPE_SIZE / 2, + input_start..input_start.add(STRIPE_SIZE / 2) + ), + (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end) ] ) ); @@ -1182,40 +1176,53 @@ mod tests { #[test] fn sharded_range_fragment_multi_stripe() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + const RANGE_SIZE: u32 = SHARD_COUNT as u32 * 
STRIPE_SIZE; + let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); // A range which covers multiple stripes, exactly one of which belongs to the current shard. let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let mut input_end = input_start; + input_end.field6 += RANGE_SIZE; // field6 is block number + // Ask for all the blocks, get a fragment that covers the whole range but reports // its size to be just the blocks belonging to our shard. assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 131072), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); - // Ask for a sub-stripe quantity + // Ask for a sub-stripe quantity that results in 3 fragments. 
+ let limit = STRIPE_SIZE / 3 + 1; assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 16000), + do_fragment(input_start, input_end, &shard_identity, limit), ( - 32768, + STRIPE_SIZE, vec![ - (16000, input_start..input_start.add(16000)), - (16000, input_start.add(16000)..input_start.add(32000)), - (768, input_start.add(32000)..input_end), + (limit, input_start..input_start.add(limit)), + (limit, input_start.add(limit)..input_start.add(2 * limit)), + ( + STRIPE_SIZE - 2 * limit, + input_start.add(2 * limit)..input_end + ), ] ) ); // Try on a range that starts slightly after our owned stripe assert_eq!( - do_fragment(input_start.add(1), input_end, &shard_identity, 131072), - (32767, vec![(32767, input_start.add(1)..input_end)]) + do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE), + ( + STRIPE_SIZE - 1, + vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)] + ) ); } @@ -1223,32 +1230,40 @@ mod tests { /// a previous relation. #[test] fn sharded_range_fragment_starting_from_logical_size() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE; + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap(); + input_end.field6 += RANGE_SIZE; // field6 is block number // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 0x10000), - (0x8001, vec![(0x8001, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE), + ( + 
STRIPE_SIZE + 1, + vec![(STRIPE_SIZE + 1, input_start..input_end)] + ) ); // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards // store all logical sizes) let shard_identity = ShardIdentity::new( ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 0x10000), - (0x1, vec![(0x1, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE), + (1, vec![(1, input_start..input_end)]) ); } @@ -1284,12 +1299,8 @@ mod tests { ); // Same, but using a sharded identity - let shard_identity = ShardIdentity::new( - ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); assert_eq!( do_fragment(input_start, input_end, &shard_identity, 0x8000), (u32::MAX, vec![(u32::MAX, input_start..input_end),]) @@ -1331,7 +1342,7 @@ mod tests { ShardIdentity::new( ShardNumber((prng.next_u32() % shard_count) as u8), ShardCount::new(shard_count as u8), - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) .unwrap() }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 8186889e10..34a419f2cf 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -26,7 +26,7 @@ use utils::{completion, serde_system_time}; use crate::config::Ratio; use crate::key::{CompactKey, Key}; use crate::reltag::RelTag; -use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; +use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. 
/// @@ -438,8 +438,6 @@ pub struct ShardParameters { } impl ShardParameters { - pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); - pub fn is_unsharded(&self) -> bool { self.count.is_unsharded() } @@ -449,7 +447,7 @@ impl Default for ShardParameters { fn default() -> Self { Self { count: ShardCount::new(0), - stripe_size: Self::DEFAULT_STRIPE_SIZE, + stripe_size: DEFAULT_STRIPE_SIZE, } } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index abbf4e6432..feb59f5070 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -92,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1); /// ShardIdentity uses a magic layout value to indicate if it is unusable const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); -/// Default stripe size in pages: 256MiB divided by 8kiB page size. -const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); +/// The default stripe size in pages. 16 MiB divided by 8 kiB page size. +/// +/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization. +/// 16 MiB appears to be a reasonable balance.
+pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8); #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { @@ -543,7 +546,7 @@ mod tests { field6: 0x7d06, }; - let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); + let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key); assert_eq!(shard, ShardNumber(8)); } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 31ab443ccd..2311cadb36 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -800,7 +800,7 @@ impl ComputeHook { #[cfg(test)] pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber}; use utils::id::TenantId; use super::*; @@ -808,6 +808,7 @@ pub(crate) mod tests { #[test] fn tenant_updates() -> anyhow::Result<()> { let tenant_id = TenantId::generate(); + let stripe_size = DEFAULT_STRIPE_SIZE; let mut tenant_state = ComputeHookTenant::new( TenantShardId { tenant_id, @@ -848,7 +849,7 @@ pub(crate) mod tests { shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - stripe_size: ShardStripeSize(32768), + stripe_size, preferred_az: None, node_id: NodeId(1), }); @@ -864,7 +865,7 @@ pub(crate) mod tests { shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - stripe_size: ShardStripeSize(32768), + stripe_size, preferred_az: None, node_id: NodeId(1), }); @@ -874,7 +875,7 @@ pub(crate) mod tests { anyhow::bail!("Wrong send result"); }; assert_eq!(request.shards.len(), 2); - assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + assert_eq!(request.stripe_size, Some(stripe_size)); // Simulate successful send *guard = Some(ComputeRemoteState { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c1c2e2c189..2ef09cd2e3 100644 --- a/storage_controller/src/service.rs +++ 
b/storage_controller/src/service.rs @@ -43,7 +43,7 @@ use pageserver_api::models::{ TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, + DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, @@ -2754,7 +2754,7 @@ impl Service { count: tenant_shard_id.shard_count, // We only import un-sharded or single-sharded tenants, so stripe // size can be made up arbitrarily here. - stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + stripe_size: DEFAULT_STRIPE_SIZE, }, placement_policy: Some(placement_policy), config: req.config.tenant_conf, @@ -7865,7 +7865,7 @@ impl Service { // old, persisted stripe size. let new_stripe_size = match candidate.id.shard_count.count() { 0 => panic!("invalid shard count 0"), - 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), + 1 => Some(DEFAULT_STRIPE_SIZE), 2.. 
=> None, }; diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 8424c65aba..3a75e96cb2 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -2000,7 +2000,7 @@ pub(crate) mod tests { use std::rc::Rc; use pageserver_api::controller_api::NodeAvailability; - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber}; use rand::SeedableRng; use rand::rngs::StdRng; use utils::id::TenantId; @@ -2012,6 +2012,7 @@ pub(crate) mod tests { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); + let stripe_size = DEFAULT_STRIPE_SIZE; let tenant_shard_id = TenantShardId { tenant_id, @@ -2020,12 +2021,7 @@ pub(crate) mod tests { }; TenantShard::new( tenant_shard_id, - ShardIdentity::new( - shard_number, - shard_count, - pageserver_api::shard::ShardStripeSize(32768), - ) - .unwrap(), + ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(), policy, None, ) @@ -2045,6 +2041,7 @@ pub(crate) mod tests { shard_count: ShardCount, preferred_az: Option, ) -> Vec { + let stripe_size = DEFAULT_STRIPE_SIZE; (0..shard_count.count()) .map(|i| { let shard_number = ShardNumber(i); @@ -2056,12 +2053,7 @@ pub(crate) mod tests { }; TenantShard::new( tenant_shard_id, - ShardIdentity::new( - shard_number, - shard_count, - pageserver_api::shard::ShardStripeSize(32768), - ) - .unwrap(), + ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(), policy.clone(), preferred_az.clone(), ) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 0175794a57..ce73c9a738 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -677,7 +677,7 @@ def test_storage_controller_compute_hook( env.storage_controller.tenant_shard_split(env.initial_tenant, 
shard_count=2) expect = { "tenant_id": str(env.initial_tenant), - "stripe_size": 32768, + "stripe_size": 2048, "shards": [ {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1},