diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index 16577e27d6..be7cff352c 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -9,6 +9,7 @@ use clap::Parser; use hex::FromHex; use hyper::StatusCode; use hyper::{Body, Request, Response}; +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; use std::{collections::HashMap, sync::Arc}; @@ -173,7 +174,8 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE if state.pageserver == Some(reattach_req.node_id) { state.generation += 1; response.tenants.push(ReAttachResponseTenant { - id: *t, + // TODO(sharding): make this shard-aware + id: TenantShardId::unsharded(*t), gen: state.generation, }); } @@ -196,7 +198,8 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { + // TODO(sharding): make this shard-aware + if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) { let valid = tenant_state.generation == req_tenant.gen; response.tenants.push(ValidateResponseTenant { id: req_tenant.id, diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs index 8232e81b98..0acc3a7bb0 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/control_api.rs @@ -4,7 +4,9 @@ //! See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TenantId}; +use utils::id::NodeId; + +use crate::shard::TenantShardId; #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { @@ -13,7 +15,7 @@ pub struct ReAttachRequest { #[derive(Serialize, Deserialize)] pub struct ReAttachResponseTenant { - pub id: TenantId, + pub id: TenantShardId, pub gen: u32, } @@ -24,7 +26,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { - pub id: TenantId, + pub id: TenantShardId, pub gen: u32, } @@ -40,6 +42,6 @@ pub struct ValidateResponse { #[derive(Serialize, Deserialize)] pub struct ValidateResponseTenant { - pub id: TenantId, + pub id: TenantShardId, pub valid: bool, } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 688b911425..3510b4dbca 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,10 +5,10 @@ use serde::{Deserialize, Serialize}; use thiserror; use utils::id::TenantId; -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)] +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)] +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); impl ShardCount { @@ -39,7 +39,7 @@ impl ShardNumber { /// Note that the binary encoding is _not_ backward compatible, because /// at the time sharding is introduced, there are no existing binary structures /// containing TenantId that we need to handle. 
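Editor's aside, not part of the patch: the `TenantShardId::unsharded(...)` constructor used in `handle_re_attach` and in the benchmark above is called but never shown in these hunks. A minimal sketch of its likely shape, assuming the struct also carries a `shard_count` field next to the `shard_number` shown just below, and that an unsharded tenant is encoded as shard 0 of 0 (both assumptions, not confirmed by this diff):

impl TenantShardId {
    // Wrap a bare TenantId as "shard 0 of 0", i.e. a tenant that is not sharded yet.
    // Hypothetical reconstruction of the helper this patch relies on.
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
            tenant_id,
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }
}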
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct TenantShardId { pub tenant_id: TenantId, pub shard_number: ShardNumber, diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 735f358d8b..5d05af0c00 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -3,6 +3,7 @@ use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::storage_layer::PersistentLayerDesc; +use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; @@ -211,7 +212,7 @@ fn bench_sequential(c: &mut Criterion) { let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); let layer = PersistentLayerDesc::new_img( - TenantId::generate(), + TenantShardId::unsharded(TenantId::generate()), TimelineId::generate(), zero.add(10 * i32)..zero.add(10 * i32 + 1), Lsn(i), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 87d9cc522e..5b170af4ef 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,6 +5,7 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; +use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde::de::IntoDeserializer; use std::env; @@ -25,7 +26,7 @@ use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; use utils::{ - id::{NodeId, TenantId, TimelineId}, + id::{NodeId, TimelineId}, logging::LogFormat, }; @@ -628,12 +629,13 @@ impl PageServerConf { self.deletion_prefix().join(format!("header-{VERSION:02x}")) } - pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenants_path().join(tenant_id.to_string()) + pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenants_path().join(tenant_shard_id.to_string()) } - pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) + pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(IGNORED_TENANT_FILE_NAME) } /// Points to a place in pageserver's local directory, @@ -641,47 +643,53 @@ impl PageServerConf { /// /// Legacy: superseded by tenant_location_config_path. Eventually /// remove this function. 
- pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME) + pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME) } - pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id) + pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } - pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) + pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(TIMELINES_SEGMENT_NAME) } - pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf { - self.timelines_path(tenant_id).join(timeline_id.to_string()) + pub fn timeline_path( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Utf8PathBuf { + self.timelines_path(tenant_shard_id) + .join(timeline_id.to_string()) } pub fn timeline_uninit_mark_file_path( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Utf8PathBuf { path_with_suffix_extension( - self.timeline_path(&tenant_id, &timeline_id), + self.timeline_path(&tenant_shard_id, &timeline_id), TIMELINE_UNINIT_MARK_SUFFIX, ) } pub fn timeline_delete_mark_file_path( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Utf8PathBuf { path_with_suffix_extension( - self.timeline_path(&tenant_id, &timeline_id), + self.timeline_path(&tenant_shard_id, &timeline_id), TIMELINE_DELETE_MARK_SUFFIX, ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id) + pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) .join(TENANT_DELETED_MARKER_FILE_NAME) } @@ -691,20 +699,24 @@ impl PageServerConf { pub fn trace_path( &self, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, connection_id: &ConnectionId, ) -> Utf8PathBuf { self.traces_path() - .join(tenant_id.to_string()) + .join(tenant_shard_id.to_string()) .join(timeline_id.to_string()) .join(connection_id.to_string()) } /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. 
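Taken together, the helpers above mean every local path under the tenants directory is now derived from a `TenantShardId` rather than a bare `TenantId`. A short usage sketch of the re-keyed API (the bindings `conf`, `tenant_id`, and `timeline_id` are hypothetical, and the exact string rendering of a `TenantShardId`, e.g. whether an unsharded id prints as the legacy bare tenant id, is an assumption not shown in this hunk):

// Hypothetical caller of the re-keyed path helpers.
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let tenant_dir = conf.tenant_path(&tenant_shard_id);                    // tenants/<tenant_shard_id>
let timeline_dir = conf.timeline_path(&tenant_shard_id, &timeline_id);  // .../timelines/<timeline_id>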
- pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf { - self.timeline_path(tenant_id, timeline_id) + pub fn metadata_path( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Utf8PathBuf { + self.timeline_path(tenant_shard_id, timeline_id) .join(METADATA_FILE_NAME) } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 2989e15e8e..213f08484c 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -351,7 +351,7 @@ impl TimelineSnapshot { let last_record_lsn = t.get_last_record_lsn(); let current_exact_logical_size = { - let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id); + let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id); let res = span .in_scope(|| t.get_current_logical_size(ctx)) .context("get_current_logical_size"); diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f50c19a629..25ae3d1b01 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,16 +1,15 @@ use std::collections::HashMap; -use pageserver_api::control_api::{ - ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, +use pageserver_api::{ + control_api::{ + ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, + }, + shard::TenantShardId, }; use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{ - backoff, - generation::Generation, - id::{NodeId, TenantId}, -}; +use utils::{backoff, generation::Generation, id::NodeId}; use crate::config::PageServerConf; @@ -31,11 +30,11 @@ pub enum RetryForeverError { #[async_trait::async_trait] pub trait ControlPlaneGenerationsApi { - async fn re_attach(&self) -> Result, RetryForeverError>; + async fn re_attach(&self) -> Result, RetryForeverError>; async fn validate( &self, - tenants: Vec<(TenantId, Generation)>, - ) -> Result, RetryForeverError>; + tenants: Vec<(TenantShardId, Generation)>, + ) -> Result, RetryForeverError>; } impl ControlPlaneClient { @@ -127,7 +126,7 @@ impl ControlPlaneClient { #[async_trait::async_trait] impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach(&self) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -154,8 +153,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down async fn validate( &self, - tenants: Vec<(TenantId, Generation)>, - ) -> Result, RetryForeverError> { + tenants: Vec<(TenantShardId, Generation)>, + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("validate") diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 4bc99eb94f..7b05745483 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -15,6 +15,7 @@ use crate::virtual_file::MaybeFatalIo; use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; +use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, 
RemotePath}; use serde::Deserialize; use serde::Serialize; @@ -25,7 +26,7 @@ use tracing::Instrument; use tracing::{self, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use utils::lsn::AtomicLsn; use utils::lsn::Lsn; @@ -193,7 +194,7 @@ struct DeletionList { /// nested HashMaps by TenantTimelineID. Each Tenant only appears once /// with one unique generation ID: if someone tries to push a second generation /// ID for the same tenant, we will start a new DeletionList. - tenants: HashMap, + tenants: HashMap, /// Avoid having to walk `tenants` to calculate the number of keys in /// the nested deletion lists @@ -265,7 +266,7 @@ impl DeletionList { /// deletion list. fn push( &mut self, - tenant: &TenantId, + tenant: &TenantShardId, timeline: &TimelineId, generation: Generation, objects: &mut Vec, @@ -357,7 +358,7 @@ struct TenantLsnState { #[derive(Default)] struct VisibleLsnUpdates { - tenants: HashMap, + tenants: HashMap, } impl VisibleLsnUpdates { @@ -414,7 +415,7 @@ impl DeletionQueueClient { pub(crate) fn recover( &self, - attached_tenants: HashMap, + attached_tenants: HashMap, ) -> Result<(), DeletionQueueError> { self.do_push( &self.tx, @@ -431,7 +432,7 @@ impl DeletionQueueClient { /// backend will later wake up and notice that the tenant's generation requires validation. pub(crate) async fn update_remote_consistent_lsn( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, lsn: Lsn, @@ -442,10 +443,13 @@ impl DeletionQueueClient { .write() .expect("Lock should never be poisoned"); - let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState { - timelines: HashMap::new(), - generation: current_generation, - }); + let tenant_entry = locked + .tenants + .entry(tenant_shard_id) + .or_insert(TenantLsnState { + timelines: HashMap::new(), + generation: current_generation, + }); if tenant_entry.generation != current_generation { // Generation might have changed if we were detached and then re-attached: in this case, @@ -472,7 +476,7 @@ impl DeletionQueueClient { /// generations in `layers` are the generations in which those layers were written. 
pub(crate) async fn push_layers( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, layers: Vec<(LayerFileName, LayerFileMetadata)>, @@ -483,7 +487,7 @@ impl DeletionQueueClient { let mut layer_paths = Vec::new(); for (layer, meta) in layers { layer_paths.push(remote_layer_path( - &tenant_id, + &tenant_shard_id.tenant_id, &timeline_id, meta.shard, &layer, @@ -494,7 +498,7 @@ impl DeletionQueueClient { return self.flush_immediate().await; } - self.push_layers_sync(tenant_id, timeline_id, current_generation, layers) + self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers) } /// When a Tenant has a generation, push_layers is always synchronous because @@ -504,7 +508,7 @@ impl DeletionQueueClient { /// support (``) pub(crate) fn push_layers_sync( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, layers: Vec<(LayerFileName, LayerFileMetadata)>, @@ -515,7 +519,7 @@ impl DeletionQueueClient { self.do_push( &self.tx, ListWriterQueueMessage::Delete(DeletionOp { - tenant_id, + tenant_shard_id, timeline_id, layers, generation: current_generation, @@ -783,12 +787,12 @@ mod test { } fn set_latest_generation(&self, gen: Generation) { - let tenant_id = self.harness.tenant_id; + let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_id, gen); + .insert(tenant_shard_id, gen); } /// Returns remote layer file name, suitable for use in assert_remote_files @@ -797,8 +801,8 @@ mod test { file_name: LayerFileName, gen: Generation, ) -> anyhow::Result { - let tenant_id = self.harness.tenant_id; - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let tenant_shard_id = self.harness.tenant_shard_id; + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); @@ -816,7 +820,7 @@ mod test { #[derive(Debug, Clone)] struct MockControlPlane { - pub latest_generation: std::sync::Arc>>, + pub latest_generation: std::sync::Arc>>, } impl MockControlPlane { @@ -830,20 +834,20 @@ mod test { #[async_trait::async_trait] impl ControlPlaneGenerationsApi for MockControlPlane { #[allow(clippy::diverging_sub_expression)] // False positive via async_trait - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach(&self) -> Result, RetryForeverError> { unimplemented!() } async fn validate( &self, - tenants: Vec<(TenantId, Generation)>, - ) -> Result, RetryForeverError> { + tenants: Vec<(TenantShardId, Generation)>, + ) -> Result, RetryForeverError> { let mut result = HashMap::new(); let latest_generation = self.latest_generation.lock().unwrap(); - for (tenant_id, generation) in tenants { - if let Some(latest) = latest_generation.get(&tenant_id) { - result.insert(tenant_id, *latest == generation); + for (tenant_shard_id, generation) in tenants { + if let Some(latest) = latest_generation.get(&tenant_shard_id) { + result.insert(tenant_shard_id, *latest == generation); } } @@ -947,10 +951,10 @@ mod test { client.recover(HashMap::new())?; let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); - let tenant_id = 
ctx.harness.tenant_id; + let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); let deletion_prefix = ctx.harness.conf.deletion_prefix(); @@ -980,7 +984,7 @@ mod test { info!("Pushing"); client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, now_generation, [(layer_file_name_1.clone(), layer_metadata)].to_vec(), @@ -1027,8 +1031,8 @@ mod test { ctx.set_latest_generation(latest_generation); - let tenant_id = ctx.harness.tenant_id; - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let tenant_shard_id = ctx.harness.tenant_shard_id; + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); // Initial state: a remote layer exists @@ -1038,7 +1042,7 @@ mod test { tracing::debug!("Pushing..."); client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, stale_generation, [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), @@ -1053,7 +1057,7 @@ mod test { tracing::debug!("Pushing..."); client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, latest_generation, [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), @@ -1075,9 +1079,9 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let tenant_id = ctx.harness.tenant_id; + let tenant_shard_id = ctx.harness.tenant_shard_id; - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); let deletion_prefix = ctx.harness.conf.deletion_prefix(); @@ -1093,7 +1097,7 @@ mod test { ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, now_generation.previous(), [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), @@ -1107,7 +1111,7 @@ mod test { ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, now_generation, [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), @@ -1138,7 +1142,7 @@ mod test { drop(client); ctx.restart().await; let client = ctx.deletion_queue.new_client(); - client.recover(HashMap::from([(tenant_id, now_generation)]))?; + client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?; info!("Flush-executing"); client.flush_execute().await?; @@ -1202,7 +1206,7 @@ pub(crate) mod mock { let mut objects = op.objects; for (layer, meta) in op.layers { objects.push(remote_layer_path( - &op.tenant_id, + &op.tenant_shard_id.tenant_id, &op.timeline_id, meta.shard, &layer, @@ -1293,7 +1297,7 @@ pub(crate) mod mock { fn deletion_list_serialization() -> anyhow::Result<()> { let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c" .to_string() - .parse::()?; + .parse::()?; let timeline_id = "be322c834ed9e709e63b5c9698691910" .to_string() .parse::()?; diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 5d52b680e4..7ff27ceb44 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ 
b/pageserver/src/deletion_queue/list_writer.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; +use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; @@ -26,7 +27,6 @@ use tracing::debug; use tracing::info; use tracing::warn; use utils::generation::Generation; -use utils::id::TenantId; use utils::id::TimelineId; use crate::config::PageServerConf; @@ -54,7 +54,7 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100); #[derive(Debug)] pub(super) struct DeletionOp { - pub(super) tenant_id: TenantId, + pub(super) tenant_shard_id: TenantShardId, pub(super) timeline_id: TimelineId, // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker @@ -62,14 +62,14 @@ pub(super) struct DeletionOp { pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, pub(super) objects: Vec, - /// The _current_ generation of the Tenant attachment in which we are enqueuing + /// The _current_ generation of the Tenant shard attachment in which we are enqueuing /// this deletion. pub(super) generation: Generation, } #[derive(Debug)] pub(super) struct RecoverOp { - pub(super) attached_tenants: HashMap, + pub(super) attached_tenants: HashMap, } #[derive(Debug)] @@ -206,7 +206,7 @@ impl ListWriter { async fn recover( &mut self, - attached_tenants: HashMap, + attached_tenants: HashMap, ) -> Result<(), anyhow::Error> { debug!( "recovering with {} attached tenants", @@ -309,8 +309,8 @@ impl ListWriter { // generation was issued to another node in the interval while we restarted, // then we may treat deletion lists from the previous generation as if they // belong to our currently attached generation, and proceed to validate & execute. 
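The generation check described in the comment above ultimately goes through `ControlPlaneGenerationsApi::validate`, whose key type this patch changes from `TenantId` to `TenantShardId`. A sketch of the call shape implied by the signatures in control_plane_client.rs and the mock in the deletion_queue tests; the response map's value type is reconstructed from that mock, and the wrapping function is hypothetical:

use std::collections::HashMap;

use pageserver_api::shard::TenantShardId;
use utils::generation::Generation;

use crate::control_plane_client::{ControlPlaneGenerationsApi, RetryForeverError};

// Ask the control plane whether `generation` is still the latest attachment of this shard.
async fn is_still_attached(
    control_plane: &impl ControlPlaneGenerationsApi,
    tenant_shard_id: TenantShardId,
    generation: Generation,
) -> Result<bool, RetryForeverError> {
    // One entry per tenant shard; the response is keyed the same way.
    let response: HashMap<TenantShardId, bool> = control_plane
        .validate(vec![(tenant_shard_id, generation)])
        .await?;
    Ok(response.get(&tenant_shard_id).copied().unwrap_or(false))
}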
- for (tenant_id, tenant_list) in &mut deletion_list.tenants { - if let Some(attached_gen) = attached_tenants.get(tenant_id) { + for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants { + if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) { if attached_gen.previous() == tenant_list.generation { tenant_list.generation = *attached_gen; } @@ -390,7 +390,7 @@ impl ListWriter { let mut layer_paths = Vec::new(); for (layer, meta) in op.layers { layer_paths.push(remote_layer_path( - &op.tenant_id, + &op.tenant_shard_id.tenant_id, &op.timeline_id, meta.shard, &layer, @@ -400,14 +400,14 @@ impl ListWriter { layer_paths.extend(op.objects); if !self.pending.push( - &op.tenant_id, + &op.tenant_shard_id, &op.timeline_id, op.generation, &mut layer_paths, ) { self.flush().await; let retry_succeeded = self.pending.push( - &op.tenant_id, + &op.tenant_shard_id, &op.timeline_id, op.generation, &mut layer_paths, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 642cafad28..f01cd1cf8c 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -310,7 +310,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( .unwrap() .as_micros(), partition, - desc.tenant_id, + desc.tenant_shard_id, desc.timeline_id, candidate.layer, ); @@ -380,7 +380,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size))); for (timeline, batch) in batched { - let tenant_id = timeline.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; let batch_size = u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning"); @@ -431,7 +431,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( (evicted_bytes, evictions_failed) } } - .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)); + .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size)); js.spawn(evict); @@ -572,7 +572,7 @@ async fn collect_eviction_candidates( continue; } let info = tl.get_local_layers_for_disk_usage_eviction().await; - debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); tenant_candidates.extend( info.resident_layers .into_iter() diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5ce09500ee..9cb411c95c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -356,7 +356,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); let info = TimelineInfo { - tenant_id: timeline.tenant_id, + // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id + tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_lsn, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d5915f4c98..b178024b9a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -7,6 +7,7 @@ use metrics::{ HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use 
pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use utils::id::{TenantId, TimelineId}; @@ -1571,9 +1572,9 @@ pub struct RemoteTimelineClientMetrics { } impl RemoteTimelineClientMetrics { - pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { RemoteTimelineClientMetrics { - tenant_id: tenant_id.to_string(), + tenant_id: tenant_shard_id.tenant_id.to_string(), timeline_id: timeline_id.to_string(), calls_unfinished_gauge: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ee5f1732e4..82c16eb9bd 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -399,6 +399,9 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id(); + // TODO(sharding): enumerate local tenant shards for this tenant, and select the one + // that should serve this request. + // Make request tracer if needed let tenant = mgr::get_active_tenant_with_timeout( tenant_id, @@ -408,9 +411,10 @@ impl PageServerHandler { .await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); - let path = tenant - .conf - .trace_path(&tenant_id, &timeline_id, &connection_id); + let path = + tenant + .conf + .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); Some(Tracer::new(path)) } else { None diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bc8779b26f..fe3a5bfb79 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -17,6 +17,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; use futures::FutureExt; use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use std::fmt; @@ -228,7 +229,7 @@ pub struct Tenant { // This is necessary to allow global config updates. tenant_conf: Arc>, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, /// The remote storage generation, used to protect S3 objects from split-brain. /// Does not change over the lifetime of the [`Tenant`] object. 
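One pattern worth calling out because it repeats throughout this patch (the eviction and metrics spans above, the attach and delete spans below): tracing spans that used to carry only `tenant_id` now carry the tenant id and the shard slug as separate fields. A minimal sketch of the convention, with a placeholder span name that is not used in this patch:

let span = tracing::info_span!(
    "some_tenant_operation", // placeholder name
    tenant_id = %tenant_shard_id.tenant_id,
    shard_id = %tenant_shard_id.shard_slug(),
);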
@@ -272,7 +273,7 @@ pub struct Tenant { impl std::fmt::Debug for Tenant { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} ({})", self.tenant_id, self.current_state()) + write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) } } @@ -466,7 +467,7 @@ impl Tenant { init_order: Option<&InitializationOrder>, _ctx: &RequestContext, ) -> anyhow::Result<()> { - let tenant_id = self.tenant_id; + let tenant_id = self.tenant_shard_id; let timeline = self.create_timeline_struct( timeline_id, @@ -558,7 +559,7 @@ impl Tenant { #[allow(clippy::too_many_arguments)] pub(crate) fn spawn( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, resources: TenantSharedResources, attached_conf: AttachedTenantConf, init_order: Option, @@ -566,8 +567,10 @@ impl Tenant { mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { + // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, tenant_id, + conf, + tenant_shard_id.tenant_id, ))); let TenantSharedResources { @@ -581,7 +584,7 @@ impl Tenant { conf, attached_conf, wal_redo_manager, - tenant_id, + tenant_shard_id, remote_storage.clone(), deletion_queue_client, )); @@ -593,7 +596,7 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - Some(tenant_id), + Some(tenant_shard_id.tenant_id), None, "attach tenant", false, @@ -632,7 +635,7 @@ impl Tenant { match tenant_clone .preload(remote_storage, task_mgr::shutdown_token()) .instrument( - tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_id), + tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { Ok(p) => p, @@ -714,7 +717,7 @@ impl Tenant { Ok(()) } .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_id); + let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); span.follows_from(Span::current()); span }), @@ -732,7 +735,7 @@ impl Tenant { info!("listing remote timelines"); let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( remote_storage, - self.tenant_id, + self.tenant_shard_id, cancel.clone(), ) .await?; @@ -844,7 +847,7 @@ impl Tenant { .with_context(|| { format!( "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_id + timeline_id, self.tenant_shard_id ) })?; } @@ -884,7 +887,7 @@ impl Tenant { /// timeline that still exists: this can happen if we crashed during a deletion/creation, or /// if a timeline was deleted while the tenant was attached to a different pageserver. 
fn clean_up_timelines(&self, existent_timelines: &HashSet) -> anyhow::Result<()> { - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); let entries = match timelines_dir.read_dir_utf8() { Ok(d) => d, @@ -970,7 +973,7 @@ impl Tenant { span::debug_assert_current_span_has_tenant_id(); info!("downloading index file for timeline {}", timeline_id); - tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_id, &timeline_id)) + tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_shard_id, &timeline_id)) .await .context("Failed to create new timeline directory")?; @@ -992,10 +995,15 @@ impl Tenant { let init_order = None; // timeline loading after attach expects to find metadata file for each metadata - save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; + save_metadata( + self.conf, + &self.tenant_shard_id, + &timeline_id, + &remote_metadata, + ) + .await + .context("save_metadata") + .map_err(LoadLocalTimelineError::Load)?; self.timeline_init_and_sync( timeline_id, @@ -1012,11 +1020,13 @@ impl Tenant { /// Create a placeholder Tenant object for a broken tenant pub fn create_broken_tenant( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, reason: String, ) -> Arc { + // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, tenant_id, + conf, + tenant_shard_id.tenant_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -1026,7 +1036,7 @@ impl Tenant { conf, AttachedTenantConf::try_from(LocationConf::default()).unwrap(), wal_redo_manager, - tenant_id, + tenant_shard_id, None, DeletionQueueClient::broken(), )) @@ -1039,7 +1049,7 @@ impl Tenant { // completed in non topological order (for example because parent has smaller number of layer files in it) let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); for entry in timelines_dir .read_dir_utf8() @@ -1070,7 +1080,7 @@ impl Tenant { "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}", ) })?; - let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id); + let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id); if let Err(e) = remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) { @@ -1087,7 +1097,7 @@ impl Tenant { info!("Found deletion mark for timeline {}", timeline_id); - match load_metadata(self.conf, &self.tenant_id, &timeline_id) { + match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) { Ok(metadata) => { timelines_to_resume_deletion.push((timeline_id, Some(metadata))) } @@ -1131,7 +1141,7 @@ impl Tenant { })?; let timeline_uninit_mark_file = self .conf - .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); + .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id); if timeline_uninit_mark_file.exists() { info!( %timeline_id, @@ -1147,7 +1157,7 @@ impl Tenant { let timeline_delete_mark_file = self .conf - .timeline_delete_mark_file_path(self.tenant_id, timeline_id); + .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id); if timeline_delete_mark_file.exists() { // Cleanup should be done in `is_delete_mark` branch above 
continue; @@ -1155,7 +1165,7 @@ impl Tenant { let file_name = entry.file_name(); if let Ok(timeline_id) = file_name.parse::() { - let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id) + let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) .context("failed to load metadata")?; timelines_to_load.insert(timeline_id, metadata); } else { @@ -1187,7 +1197,7 @@ impl Tenant { remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, - self.tenant_id, + self.tenant_shard_id, timeline_id, self.generation, ); @@ -1393,8 +1403,12 @@ impl Tenant { .map_err(LoadLocalTimelineError::Load) } - pub fn tenant_id(&self) -> TenantId { - self.tenant_id + pub(crate) fn tenant_id(&self) -> TenantId { + self.tenant_shard_id.tenant_id + } + + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id } /// Get Timeline handle for given Neon timeline ID. @@ -1408,13 +1422,13 @@ impl Tenant { let timeline = timelines_accessor .get(&timeline_id) .ok_or(GetTimelineError::NotFound { - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id, })?; if active_only && !timeline.is_active() { Err(GetTimelineError::NotActive { - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id, state: timeline.current_state(), }) @@ -1772,7 +1786,7 @@ impl Tenant { *current_state = TenantState::Activating(ActivatingFrom::Attaching); } } - debug!(tenant_id = %self.tenant_id, "Activating tenant"); + debug!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), "Activating tenant"); activating = true; // Continue outside the closure. We need to grab timelines.lock() // and we plan to turn it into a tokio::sync::Mutex in a future patch. @@ -1809,7 +1823,8 @@ impl Tenant { // times to activate. see https://github.com/neondatabase/neon/issues/4025 info!( since_creation_millis = elapsed.as_millis(), - tenant_id = %self.tenant_id, + tenant_id = %self.tenant_shard_id.tenant_id, + shard_id = %self.tenant_shard_id.shard_slug(), activated_timelines, total_timelines, post_state = <&'static str>::from(&*current_state), @@ -1906,7 +1921,7 @@ impl Tenant { // // this will additionally shutdown and await all timeline tasks. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await; // Wait for any in-flight operations to complete self.gate.close().await; @@ -2081,7 +2096,7 @@ impl Tenant { receiver.changed().await.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id)) + GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) )?; } TenantState::Active { .. 
} => { @@ -2155,9 +2170,6 @@ where } impl Tenant { - pub fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } pub fn tenant_specific_overrides(&self) -> TenantConfOpt { self.tenant_conf.read().unwrap().tenant_conf } @@ -2307,7 +2319,7 @@ impl Tenant { new_metadata, ancestor, new_timeline_id, - self.tenant_id, + self.tenant_shard_id, self.generation, Arc::clone(&self.walredo_mgr), resources, @@ -2329,14 +2341,14 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, walredo_mgr: Arc, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, remote_storage: Option, deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - let tid = tenant_id.to_string(); + let tid = tenant_shard_id.to_string(); fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. })) @@ -2388,7 +2400,7 @@ impl Tenant { }); Tenant { - tenant_id, + tenant_shard_id, generation: attached_conf.location.generation, conf, // using now here is good enough approximation to catch tenants with really long @@ -2406,17 +2418,17 @@ impl Tenant { eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), - gate: Gate::new(format!("Tenant<{tenant_id}>")), + gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), } } /// Locate and load config pub(super) fn load_tenant_config( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> anyhow::Result { - let legacy_config_path = conf.tenant_config_path(tenant_id); - let config_path = conf.tenant_location_config_path(tenant_id); + let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + let config_path = conf.tenant_location_config_path(tenant_shard_id); if config_path.exists() { // New-style config takes precedence @@ -2470,29 +2482,34 @@ impl Tenant { .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) } - #[tracing::instrument(skip_all, fields(%tenant_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, location_conf: &LocationConf, ) -> anyhow::Result<()> { - let legacy_config_path = conf.tenant_config_path(tenant_id); - let config_path = conf.tenant_location_config_path(tenant_id); + let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + let config_path = conf.tenant_location_config_path(tenant_shard_id); - Self::persist_tenant_config_at(tenant_id, &config_path, &legacy_config_path, location_conf) - .await + Self::persist_tenant_config_at( + tenant_shard_id, + &config_path, + &legacy_config_path, + location_conf, + ) + .await } - #[tracing::instrument(skip_all, fields(%tenant_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config_at( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, config_path: &Utf8Path, legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { // Forward compat: write out an old-style configuration that old versions can read, in case we roll back Self::persist_tenant_config_legacy( - tenant_id, + 
tenant_shard_id, legacy_config_path, &location_conf.tenant_conf, ) @@ -2519,14 +2536,16 @@ impl Tenant { let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_id = *tenant_id; + let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); tokio::task::spawn_blocking(move || { Handle::current().block_on(async move { let conf_content = conf_content.as_bytes(); VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) .await - .with_context(|| format!("write tenant {tenant_id} config to {config_path}")) + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {config_path}") + }) }) }) .await??; @@ -2534,9 +2553,9 @@ impl Tenant { Ok(()) } - #[tracing::instrument(skip_all, fields(%tenant_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] async fn persist_tenant_config_legacy( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, target_config_path: &Utf8Path, tenant_conf: &TenantConfOpt, ) -> anyhow::Result<()> { @@ -2554,7 +2573,7 @@ impl Tenant { let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX); - let tenant_id = *tenant_id; + let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); tokio::task::spawn_blocking(move || { Handle::current().block_on(async move { @@ -2562,7 +2581,7 @@ impl Tenant { VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) .await .with_context(|| { - format!("write tenant {tenant_id} config to {target_config_path}") + format!("write tenant {tenant_shard_id} config to {target_config_path}") }) }) }) @@ -2940,7 +2959,7 @@ impl Tenant { // temporary directory for basebackup files for the given timeline. 
let pgdata_path = path_with_suffix_extension( self.conf - .timelines_path(&self.tenant_id) + .timelines_path(&self.tenant_shard_id) .join(format!("basebackup-{timeline_id}")), TEMP_FILE_SUFFIX, ); @@ -2971,7 +2990,7 @@ impl Tenant { || async { self::remote_timeline_client::upload_initdb_dir( storage, - &self.tenant_id, + &self.tenant_shard_id.tenant_id, &timeline_id, pgdata_zstd.clone(), ) @@ -3010,7 +3029,7 @@ impl Tenant { ) .await?; - let tenant_id = raw_timeline.owning_tenant.tenant_id; + let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; let unfinished_timeline = raw_timeline.raw_timeline()?; import_datadir::import_timeline_from_postgres_datadir( @@ -3021,7 +3040,7 @@ impl Tenant { ) .await .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") + format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") })?; // Flush the new layer files to disk, before we make the timeline as available to @@ -3039,7 +3058,7 @@ impl Tenant { .await .with_context(|| { format!( - "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}" + "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" ) })?; @@ -3062,7 +3081,7 @@ impl Tenant { remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, - self.tenant_id, + self.tenant_shard_id, timeline_id, self.generation, ); @@ -3091,7 +3110,7 @@ impl Tenant { start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); if let Some(remote_client) = &resources.remote_client { @@ -3115,12 +3134,14 @@ impl Tenant { .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) .await { - error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); cleanup_timeline_directory(uninit_mark); return Err(e); } - debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"); + debug!( + "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}" + ); Ok(UninitializedTimeline::new( self, @@ -3141,9 +3162,14 @@ impl Tenant { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); }); - save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata) - .await - .context("Failed to create timeline metadata")?; + save_metadata( + self.conf, + &self.tenant_shard_id, + new_timeline_id, + new_metadata, + ) + .await + .context("Failed to create timeline metadata")?; Ok(()) } @@ -3156,13 +3182,13 @@ impl Tenant { timeline_id: TimelineId, timelines: &MutexGuard>>, ) -> anyhow::Result { - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_shard_id; anyhow::ensure!( timelines.get(&timeline_id).is_none(), - "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory" + "Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory" ); - let timeline_path = self.conf.timeline_path(&tenant_id, &timeline_id); + let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); anyhow::ensure!( !timeline_path.exists(), "Timeline {timeline_path} already exists, cannot create its uninit mark file", @@ -3170,7 +3196,7 @@ impl Tenant { let uninit_mark_path = self .conf - .timeline_uninit_mark_file_path(tenant_id, 
timeline_id); + .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); fs::File::create(&uninit_mark_path) .context("Failed to create uninit mark file") .and_then(|_| { @@ -3178,7 +3204,7 @@ impl Tenant { .context("Failed to fsync uninit mark file") }) .with_context(|| { - format!("Failed to crate uninit mark for timeline {tenant_id}/{timeline_id}") + format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") })?; let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path); @@ -3189,7 +3215,7 @@ impl Tenant { /// Gathers inputs from all of the timelines to produce a sizing model input. /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub async fn gather_size_inputs( &self, // `max_retention_period` overrides the cutoff that is used to calculate the size @@ -3228,7 +3254,7 @@ impl Tenant { /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct - #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub async fn calculate_synthetic_size( &self, cause: LogicalSizeCalculationCause, @@ -3250,7 +3276,7 @@ impl Tenant { .store(size, Ordering::Relaxed); TENANT_SYNTHETIC_SIZE_METRIC - .get_metric_with_label_values(&[&self.tenant_id.to_string()]) + .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) .unwrap() .set(size); } @@ -3286,9 +3312,9 @@ fn remove_timeline_and_uninit_mark( pub(crate) async fn create_tenant_files( conf: &'static PageServerConf, location_conf: &LocationConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> anyhow::Result { - let target_tenant_directory = conf.tenant_path(tenant_id); + let target_tenant_directory = conf.tenant_path(tenant_shard_id); anyhow::ensure!( !target_tenant_directory .try_exists() @@ -3308,14 +3334,16 @@ pub(crate) async fn create_tenant_files( let creation_result = try_create_target_tenant_dir( conf, location_conf, - tenant_id, + tenant_shard_id, &temporary_tenant_dir, &target_tenant_directory, ) .await; if creation_result.is_err() { - error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); + error!( + "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data" + ); if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { @@ -3333,31 +3361,31 @@ pub(crate) async fn create_tenant_files( async fn try_create_target_tenant_dir( conf: &'static PageServerConf, location_conf: &LocationConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, temporary_tenant_dir: &Utf8Path, target_tenant_directory: &Utf8Path, ) -> Result<(), anyhow::Error> { let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(tenant_id), + &conf.timelines_path(tenant_shard_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?; let 
temporary_legacy_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_id), + &conf.tenant_config_path(tenant_shard_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; let temporary_tenant_config_path = rebase_directory( - &conf.tenant_location_config_path(tenant_id), + &conf.tenant_location_config_path(tenant_shard_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; Tenant::persist_tenant_config_at( - tenant_id, + tenant_shard_id, &temporary_tenant_config_path, &temporary_legacy_tenant_config_path, location_conf, @@ -3367,7 +3395,7 @@ async fn try_create_target_tenant_dir( crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( "create tenant {} temporary timelines directory {}", - tenant_id, temporary_tenant_timelines_dir, + tenant_shard_id, temporary_tenant_timelines_dir, ) })?; fail::fail_point!("tenant-creation-before-tmp-rename", |_| { @@ -3382,19 +3410,19 @@ async fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "move tenant {} temporary directory {} into the permanent one {}", - tenant_id, temporary_tenant_dir, target_tenant_directory + tenant_shard_id, temporary_tenant_dir, target_tenant_directory ) })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( "get tenant {} dir parent for {}", - tenant_id, target_tenant_directory, + tenant_shard_id, target_tenant_directory, ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( "fsync renamed directory's parent {} for tenant {}", - target_dir_parent, tenant_id, + target_dir_parent, tenant_shard_id, ) })?; @@ -3472,7 +3500,7 @@ async fn run_initdb( impl Drop for Tenant { fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_id); + remove_tenant_metrics(&self.tenant_shard_id.tenant_id); } } /// Dump contents of a layer file to stdout. 
@@ -3575,7 +3603,9 @@ pub(crate) mod harness { pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - pub tenant_id: TenantId, + // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id + pub(crate) tenant_id: TenantId, + pub tenant_shard_id: TenantShardId, pub generation: Generation, pub shard: ShardIndex, pub remote_storage: GenericRemoteStorage, @@ -3620,8 +3650,9 @@ pub(crate) mod harness { }; let tenant_id = TenantId::generate(); - fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.timelines_path(&tenant_id))?; + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; + fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; let remote_fs_dir = conf.workdir.join("localfs"); @@ -3636,6 +3667,7 @@ pub(crate) mod harness { conf, tenant_conf, tenant_id, + tenant_shard_id, generation: Generation::new(0xdeadbeef), shard: ShardIndex::unsharded(), remote_storage, @@ -3655,7 +3687,7 @@ pub(crate) mod harness { } fn remote_empty(&self) -> bool { - let tenant_path = self.conf.tenant_path(&self.tenant_id); + let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); let remote_tenant_dir = self .remote_fs_dir .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); @@ -3695,7 +3727,7 @@ pub(crate) mod harness { )) .unwrap(), walredo_mgr, - self.tenant_id, + self.tenant_shard_id, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), )); @@ -3704,17 +3736,17 @@ pub(crate) mod harness { LoadMode::Local => { tenant .load_local(None, ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } LoadMode::Remote => { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) - .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_id)) + .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; tenant .attach(None, Some(preload), ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } } @@ -3748,7 +3780,7 @@ pub(crate) mod harness { } pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { - self.conf.timeline_path(&self.tenant_id, timeline_id) + self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } } @@ -3864,7 +3896,7 @@ mod tests { e.to_string(), format!( "Timeline {}/{} already exists in pageserver's memory", - tenant.tenant_id, TIMELINE_ID + tenant.tenant_shard_id, TIMELINE_ID ) ), } @@ -4248,7 +4280,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) .await .ok() .unwrap(); @@ -4289,7 +4321,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) .await .ok() .unwrap(); @@ -4351,7 
+4383,7 @@ mod tests { // so that all uploads finish & we can call harness.try_load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) .await .ok() .unwrap(); @@ -4884,7 +4916,7 @@ mod tests { let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id)) .await; std::mem::forget(tline); } @@ -4896,7 +4928,7 @@ mod tests { assert_eq!( e, GetTimelineError::NotFound { - tenant_id: tenant.tenant_id, + tenant_id: tenant.tenant_shard_id.tenant_id, timeline_id: TIMELINE_ID, } ) @@ -4905,12 +4937,12 @@ mod tests { assert!(!harness .conf - .timeline_path(&tenant.tenant_id, &TIMELINE_ID) + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); assert!(!harness .conf - .timeline_uninit_mark_file_path(tenant.tenant_id, TIMELINE_ID) + .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) .exists()); Ok(()) diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 6289d40b93..b7b2ef9c79 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -2,16 +2,13 @@ use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::models::TenantState; +use pageserver_api::{models::TenantState, shard::TenantShardId}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; use tracing::{error, instrument, Instrument, Span}; -use utils::{ - backoff, completion, crashsafe, fs_ext, - id::{TenantId, TimelineId}, -}; +use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; use crate::{ config::PageServerConf, @@ -60,10 +57,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard; fn remote_tenant_delete_mark_path( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> anyhow::Result { let tenant_remote_path = conf - .tenant_path(tenant_id) + .tenant_path(tenant_shard_id) .strip_prefix(&conf.workdir) .context("Failed to strip workdir prefix") .and_then(RemotePath::new) @@ -74,9 +71,9 @@ fn remote_tenant_delete_mark_path( async fn create_remote_delete_mark( conf: &PageServerConf, remote_storage: &GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { - let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; + let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; let data: &[u8] = &[]; backoff::retry( @@ -100,9 +97,9 @@ async fn create_remote_delete_mark( async fn create_local_delete_mark( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { - let marker_path = conf.tenant_deleted_mark_file_path(tenant_id); + let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id); // Note: we're ok to replace existing file. 
let _ = std::fs::OpenOptions::new() @@ -171,10 +168,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, remote_storage: Option<&GenericRemoteStorage>, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_id)?; + let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; backoff::retry( || async { remote_storage.delete(&path).await }, |_e| false, @@ -193,7 +190,7 @@ async fn remove_tenant_remote_delete_mark( // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir async fn cleanup_remaining_fs_traces( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { let rm = |p: Utf8PathBuf, is_dir: bool| async move { if is_dir { @@ -205,8 +202,8 @@ async fn cleanup_remaining_fs_traces( .with_context(|| format!("failed to delete {p}")) }; - rm(conf.tenant_config_path(tenant_id), false).await?; - rm(conf.tenant_location_config_path(tenant_id), false).await?; + rm(conf.tenant_config_path(tenant_shard_id), false).await?; + rm(conf.tenant_location_config_path(tenant_shard_id), false).await?; fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { Err(anyhow::anyhow!( @@ -214,7 +211,7 @@ async fn cleanup_remaining_fs_traces( ))? }); - rm(conf.timelines_path(tenant_id), true).await?; + rm(conf.timelines_path(tenant_shard_id), true).await?; fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { Err(anyhow::anyhow!( @@ -228,14 +225,14 @@ async fn cleanup_remaining_fs_traces( // to be reordered later and thus missed if a crash occurs. // Note that we dont need to sync after mark file is removed // because we can tolerate the case when mark file reappears on startup. - let tenant_path = &conf.tenant_path(tenant_id); + let tenant_path = &conf.tenant_path(tenant_shard_id); if tenant_path.exists() { - crashsafe::fsync_async(&conf.tenant_path(tenant_id)) + crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id)) .await .context("fsync_pre_mark_remove")?; } - rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?; + rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { Err(anyhow::anyhow!( @@ -243,7 +240,7 @@ async fn cleanup_remaining_fs_traces( ))? }); - rm(conf.tenant_path(tenant_id), true).await?; + rm(conf.tenant_path(tenant_shard_id), true).await?; Ok(()) } @@ -324,7 +321,7 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id) + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id) .await .context("remote_mark")? } @@ -335,7 +332,7 @@ impl DeleteTenantFlow { ))? }); - create_local_delete_mark(conf, &tenant.tenant_id) + create_local_delete_mark(conf, &tenant.tenant_shard_id) .await .context("local delete mark")?; @@ -377,9 +374,11 @@ impl DeleteTenantFlow { return Ok(acquire(tenant)); } - let tenant_id = tenant.tenant_id; // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. 
- if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { + if conf + .tenant_deleted_mark_file_path(&tenant.tenant_shard_id) + .exists() + { Ok(acquire(tenant)) } else { Ok(None) @@ -462,12 +461,12 @@ impl DeleteTenantFlow { tenants: &'static std::sync::RwLock, tenant: Arc, ) { - let tenant_id = tenant.tenant_id; + let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_id), + Some(tenant_shard_id.tenant_id), None, "tenant_delete", false, @@ -481,7 +480,7 @@ impl DeleteTenantFlow { Ok(()) } .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id); + let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); span.follows_from(Span::current()); span }), @@ -519,7 +518,7 @@ impl DeleteTenantFlow { } } - let timelines_path = conf.timelines_path(&tenant.tenant_id); + let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); // May not exist if we fail in cleanup_remaining_fs_traces after removing it if timelines_path.exists() { // sanity check to guard against layout changes @@ -528,7 +527,8 @@ impl DeleteTenantFlow { .context("timelines dir not empty")?; } - remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?; + remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id) + .await?; fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { Err(anyhow::anyhow!( @@ -536,7 +536,7 @@ impl DeleteTenantFlow { ))? }); - cleanup_remaining_fs_traces(conf, &tenant.tenant_id) + cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id) .await .context("cleanup_remaining_fs_traces")?; @@ -553,7 +553,7 @@ impl DeleteTenantFlow { // we encounter an InProgress marker, yield the barrier it contains and wait on it. let barrier = { let mut locked = tenants.write().unwrap(); - let removed = locked.remove(&tenant.tenant_id); + let removed = locked.remove(&tenant.tenant_shard_id.tenant_id); // FIXME: we should not be modifying this from outside of mgr.rs. 
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 9a06d9df61..591eacd104 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -7,18 +7,19 @@ use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::VirtualFile; use camino::Utf8PathBuf; +use pageserver_api::shard::TenantShardId; use std::cmp::min; use std::fs::OpenOptions; use std::io::{self, ErrorKind}; use std::ops::DerefMut; use std::sync::atomic::AtomicU64; use tracing::*; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; pub struct EphemeralFile { page_cache_file_id: page_cache::FileId, - _tenant_id: TenantId, + _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, file: VirtualFile, len: u64, @@ -31,7 +32,7 @@ pub struct EphemeralFile { impl EphemeralFile { pub async fn create( conf: &PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -39,7 +40,7 @@ impl EphemeralFile { NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); let filename = conf - .timeline_path(&tenant_id, &timeline_id) + .timeline_path(&tenant_shard_id, &timeline_id) .join(Utf8PathBuf::from(format!( "ephemeral-{filename_disambiguator}" ))); @@ -52,7 +53,7 @@ impl EphemeralFile { Ok(EphemeralFile { page_cache_file_id: page_cache::next_file_id(), - _tenant_id: tenant_id, + _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, file, len: 0, @@ -282,7 +283,7 @@ mod tests { ) -> Result< ( &'static PageServerConf, - TenantId, + TenantShardId, TimelineId, RequestContext, ), @@ -295,13 +296,13 @@ mod tests { // OK in a test. 
let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap(); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?; + fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?; let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); - Ok((conf, tenant_id, timeline_id, ctx)) + Ok((conf, tenant_shard_id, timeline_id, ctx)) } #[tokio::test] diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 38fd426746..6fb86c65e2 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -11,15 +11,12 @@ use std::io::{self}; use anyhow::{ensure, Context}; +use pageserver_api::shard::TenantShardId; use serde::{de::Error, Deserialize, Serialize, Serializer}; use thiserror::Error; use utils::bin_ser::SerializeError; use utils::crashsafe::path_with_suffix_extension; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; @@ -272,14 +269,14 @@ impl Serialize for TimelineMetadata { } /// Save timeline metadata to file -#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))] +#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))] pub async fn save_metadata( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, data: &TimelineMetadata, ) -> anyhow::Result<()> { - let path = conf.metadata_path(tenant_id, timeline_id); + let path = conf.metadata_path(tenant_shard_id, timeline_id); let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); let metadata_bytes = data.to_bytes().context("serialize metadata")?; VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes) @@ -299,10 +296,10 @@ pub enum LoadMetadataError { pub fn load_metadata( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Result { - let metadata_path = conf.metadata_path(tenant_id, timeline_id); + let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id); let metadata_bytes = std::fs::read(metadata_path)?; Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 52d697a878..e94d29327e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -272,8 +272,8 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, -) -> HashMap { + tenant_confs: &HashMap>, +) -> HashMap { tenant_confs .iter() .filter_map(|(tid, lc)| { @@ -293,10 +293,10 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let generations = if conf.control_plane_emergency_mode { error!( "Emergency mode! 
Tenants will be attached unsafely using their last known generation" @@ -339,7 +339,7 @@ async fn init_load_generations( fn load_tenant_config( conf: &'static PageServerConf, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> anyhow::Result)>> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); @@ -375,10 +375,10 @@ fn load_tenant_config( return Ok(None); } - let tenant_id = match tenant_dir_path + let tenant_shard_id = match tenant_dir_path .file_name() .unwrap_or_default() - .parse::() + .parse::() { Ok(id) => id, Err(_) => { @@ -388,8 +388,8 @@ fn load_tenant_config( }; Ok(Some(( - tenant_id, - Tenant::load_tenant_config(conf, &tenant_id), + tenant_shard_id, + Tenant::load_tenant_config(conf, &tenant_shard_id), ))) } @@ -400,7 +400,7 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let tenants_dir = conf.tenants_path(); let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { @@ -450,19 +450,19 @@ pub async fn init_tenant_mgr( init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; // Construct `Tenant` objects and start them running - for (tenant_id, location_conf) in tenant_configs { - let tenant_dir_path = conf.tenant_path(&tenant_id); + for (tenant_shard_id, location_conf) in tenant_configs { + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let mut location_conf = match location_conf { Ok(l) => l, Err(e) => { - warn!(%tenant_id, "Marking tenant broken, failed to {e:#}"); + warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}"); tenants.insert( - TenantShardId::unsharded(tenant_id), + tenant_shard_id, TenantSlot::Attached(Tenant::create_broken_tenant( conf, - tenant_id, + tenant_shard_id, format!("{}", e), )), ); @@ -473,7 +473,7 @@ pub async fn init_tenant_mgr( let generation = if let Some(generations) = &tenant_generations { // We have a generation map: treat it as the authority for whether // this tenant is really attached. - if let Some(gen) = generations.get(&tenant_id) { + if let Some(gen) = generations.get(&tenant_shard_id) { *gen } else { match &location_conf.mode { @@ -481,8 +481,8 @@ pub async fn init_tenant_mgr( // We do not require the control plane's permission for secondary mode // tenants, because they do no remote writes and hence require no // generation number - info!(%tenant_id, "Loaded tenant in secondary mode"); - tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary); + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); + tenants.insert(tenant_shard_id, TenantSlot::Secondary); } LocationMode::Attached(_) => { // TODO: augment re-attach API to enable the control plane to @@ -490,9 +490,9 @@ pub async fn init_tenant_mgr( // away local state, we can gracefully fall back to secondary here, if the control // plane tells us so. 
// (https://github.com/neondatabase/neon/issues/5377) - info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response"); + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(%tenant_id, + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", ); } @@ -504,18 +504,18 @@ pub async fn init_tenant_mgr( } else { // Legacy mode: no generation information, any tenant present // on local disk may activate - info!(%tenant_id, "Starting tenant in legacy mode, no generation",); + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); Generation::none() }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. location_conf.attach_in_generation(generation); - Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; + Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; match tenant_spawn( conf, - tenant_id, + tenant_shard_id, &tenant_dir_path, resources.clone(), AttachedTenantConf::try_from(location_conf)?, @@ -531,7 +531,7 @@ pub async fn init_tenant_mgr( ); } Err(e) => { - error!(%tenant_id, "Failed to start tenant: {e:#}"); + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); } } } @@ -555,7 +555,7 @@ pub async fn init_tenant_mgr( #[allow(clippy::too_many_arguments)] pub(crate) fn tenant_spawn( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, resources: TenantSharedResources, location_conf: AttachedTenantConf, @@ -579,16 +579,16 @@ pub(crate) fn tenant_spawn( "Cannot load tenant from empty directory {tenant_path:?}" ); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); anyhow::ensure!( - !conf.tenant_ignore_mark_file_path(&tenant_id).exists(), + !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(), "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!("Attaching tenant {tenant_id}"); + info!("Attaching tenant {tenant_shard_id}"); let tenant = match Tenant::spawn( conf, - tenant_id, + tenant_shard_id, resources, location_conf, init_order, @@ -598,8 +598,8 @@ pub(crate) fn tenant_spawn( ) { Ok(tenant) => tenant, Err(e) => { - error!("Failed to spawn tenant {tenant_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) + error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) } }; @@ -757,13 +757,11 @@ pub(crate) async fn create_tenant( let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - // TODO(sharding): make local paths shard-aware - let tenant_path = - super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?; + let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; let created_tenant = tenant_spawn( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, &tenant_path, resources, 
AttachedTenantConf::try_from(location_conf)?, @@ -803,8 +801,9 @@ pub(crate) async fn set_new_tenant_config( // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); - Tenant::persist_tenant_config(conf, &tenant_id, &location_conf) + Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) .await .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); @@ -935,8 +934,7 @@ impl TenantManager { slot_guard.drop_old_value().expect("We just shut it down"); } - // TODO(sharding): make local paths sharding-aware - let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id); + let tenant_path = self.conf.tenant_path(&tenant_shard_id); let new_slot = match &new_location_config.mode { LocationMode::Secondary(_) => { @@ -946,20 +944,14 @@ impl TenantManager { .await .with_context(|| format!("Creating {tenant_path}"))?; - // TODO(sharding): make local paths sharding-aware - Tenant::persist_tenant_config( - self.conf, - &tenant_shard_id.tenant_id, - &new_location_config, - ) - .await - .map_err(SetNewTenantConfigError::Persist)?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; TenantSlot::Secondary } LocationMode::Attached(_attach_config) => { - // TODO(sharding): make local paths sharding-aware - let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); // Directory doesn't need to be fsync'd because we do not depend on // it to exist after crashes: it may be recreated when tenant is @@ -968,19 +960,13 @@ impl TenantManager { .await .with_context(|| format!("Creating {timelines_path}"))?; - // TODO(sharding): make local paths sharding-aware - Tenant::persist_tenant_config( - self.conf, - &tenant_shard_id.tenant_id, - &new_location_config, - ) - .await - .map_err(SetNewTenantConfigError::Persist)?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; - // TODO(sharding): make spawn sharding-aware let tenant = tenant_spawn( self.conf, - tenant_shard_id.tenant_id, + tenant_shard_id, &tenant_path, self.resources.clone(), AttachedTenantConf::try_from(new_location_config)?, @@ -1282,8 +1268,7 @@ async fn detach_tenant0( deletion_queue_client: &DeletionQueueClient, ) -> Result { let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - // TODO(sharding): make local path helpers shard-aware - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id); + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) @@ -1308,8 +1293,7 @@ async fn detach_tenant0( Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) ) { - // TODO(sharding): make local paths sharding-aware - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); if tenant_ignore_mark.exists() { info!("Detaching an ignored tenant"); let tmp_path = tenant_dir_rename_operation(tenant_shard_id) @@ -1338,9 +1322,9 @@ pub(crate) async fn 
load_tenant( let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = conf.tenant_path(&tenant_id); + let tenant_path = conf.tenant_path(&tenant_shard_id); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); if tenant_ignore_mark.exists() { std::fs::remove_file(&tenant_ignore_mark).with_context(|| { format!( @@ -1356,14 +1340,14 @@ pub(crate) async fn load_tenant( }; let mut location_conf = - Tenant::load_tenant_config(conf, &tenant_id).map_err(TenantMapInsertError::Other)?; + Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; location_conf.attach_in_generation(generation); - Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; + Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; let new_tenant = tenant_spawn( conf, - tenant_id, + tenant_shard_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, @@ -1394,7 +1378,7 @@ async fn ignore_tenant0( let tenant_shard_id = TenantShardId::unsharded(tenant_id); remove_tenant_from_memory(tenants, tenant_shard_id, async { - let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id); + let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); fs::File::create(&ignore_mark_file) .await .context("Failed to create ignore mark file") @@ -1452,13 +1436,13 @@ pub(crate) async fn attach_tenant( let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; let location_conf = LocationConf::attached_single(tenant_conf, generation); - let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?; + let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 let attached_tenant = tenant_spawn( conf, - tenant_id, + tenant_shard_id, &tenant_dir, resources, AttachedTenantConf::try_from(location_conf)?, @@ -1974,6 +1958,9 @@ pub(crate) async fn immediate_gc( .with_context(|| format!("tenant {tenant_id}")) .map_err(|e| ApiError::NotFound(e.into()))?; + // TODO(sharding): make callers of this function shard-aware + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); @@ -1995,7 +1982,7 @@ pub(crate) async fn immediate_gc( #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. 
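The mgr.rs hunks above repeat one pattern: a bare TenantId from a not-yet-shard-aware caller is wrapped into an unsharded TenantShardId, and tracing spans gain an explicit shard_id field next to tenant_id. The following is a minimal sketch of that pattern using only the types and calls visible in this diff; process_tenant is a hypothetical helper for illustration, not code from this patch.

use pageserver_api::shard::TenantShardId;
use utils::id::TenantId;

fn process_tenant(tenant_id: TenantId) {
    // Wrap the legacy TenantId so path helpers and spans can take the shard-aware type.
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
    let span = tracing::info_span!(
        "process_tenant",
        tenant_id = %tenant_shard_id.tenant_id,
        shard_id = %tenant_shard_id.shard_slug(),
    );
    let _entered = span.enter();
    // From here on, helpers such as conf.tenant_path(&tenant_shard_id) are called
    // with the TenantShardId rather than the bare TenantId.
}

The TODO(sharding) comments in the hunks above mark the call sites where this wrapping is still performed on behalf of callers that only have a TenantId.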
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 01c60ca8f8..183ee19a40 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -188,7 +188,7 @@ use anyhow::Context; use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; pub(crate) use upload::upload_initdb_dir; @@ -301,7 +301,7 @@ pub struct RemoteTimelineClient { runtime: tokio::runtime::Handle, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, @@ -325,7 +325,7 @@ impl RemoteTimelineClient { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, ) -> RemoteTimelineClient { @@ -337,13 +337,16 @@ impl RemoteTimelineClient { } else { BACKGROUND_RUNTIME.handle().clone() }, - tenant_id, + tenant_shard_id, timeline_id, generation, storage_impl: remote_storage, deletion_queue_client, upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &tenant_shard_id, + &timeline_id, + )), } } @@ -403,11 +406,6 @@ impl RemoteTimelineClient { Ok(()) } - pub(crate) fn get_shard_index(&self) -> ShardIndex { - // TODO: carry this on the struct - ShardIndex::unsharded() - } - pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, @@ -469,14 +467,13 @@ impl RemoteTimelineClient { let index_part = download::download_index_part( &self.storage_impl, - &self.tenant_id, + &self.tenant_shard_id, &self.timeline_id, - self.get_shard_index(), self.generation, cancel, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Download, @@ -512,13 +509,13 @@ impl RemoteTimelineClient { download::download_layer_file( self.conf, &self.storage_impl, - self.tenant_id, + self.tenant_shard_id, self.timeline_id, layer_file_name, layer_metadata, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Download, @@ -966,9 +963,8 @@ impl RemoteTimelineClient { || { upload::upload_index_part( &self.storage_impl, - &self.tenant_id, + &self.tenant_shard_id, &self.timeline_id, - self.get_shard_index(), self.generation, &index_part_with_deleted_at, ) @@ -1025,7 +1021,7 @@ impl RemoteTimelineClient { .drain() .map(|(file_name, meta)| { remote_layer_path( - &self.tenant_id, + &self.tenant_shard_id.tenant_id, &self.timeline_id, meta.shard, &file_name, @@ -1040,7 +1036,7 @@ impl RemoteTimelineClient { // Do not delete index part yet, it is needed for possible retry. 
If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage - let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id); + let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't // taking the burden of listing all the layers that we already know we should delete. @@ -1076,12 +1072,7 @@ impl RemoteTimelineClient { .unwrap_or( // No generation-suffixed indices, assume we are dealing with // a legacy index. - remote_index_path( - &self.tenant_id, - &self.timeline_id, - self.get_shard_index(), - Generation::none(), - ), + remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()), ); let remaining_layers: Vec = remaining @@ -1213,12 +1204,12 @@ impl RemoteTimelineClient { // Spawn task to perform the task let self_rc = Arc::clone(self); - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_shard_id; let timeline_id = self.timeline_id; task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "remote upload", false, @@ -1226,7 +1217,7 @@ impl RemoteTimelineClient { self_rc.perform_upload_task(task).await; Ok(()) } - .instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)), + .instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)), ); // Loop back to process next task @@ -1278,7 +1269,7 @@ impl RemoteTimelineClient { self.generation, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Upload, @@ -1298,14 +1289,13 @@ impl RemoteTimelineClient { let res = upload::upload_index_part( &self.storage_impl, - &self.tenant_id, + &self.tenant_shard_id, &self.timeline_id, - self.get_shard_index(), self.generation, index_part, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Upload, @@ -1325,7 +1315,7 @@ impl RemoteTimelineClient { pausable_failpoint!("before-delete-layer-pausable"); self.deletion_queue_client .push_layers( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.generation, delete.layers.clone(), @@ -1444,7 +1434,7 @@ impl RemoteTimelineClient { // data safety guarantees (see docs/rfcs/025-generation-numbers.md) self.deletion_queue_client .update_remote_consistent_lsn( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.generation, lsn, @@ -1602,15 +1592,21 @@ impl RemoteTimelineClient { } } -pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath { - let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); +pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") } -pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath { - remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string())) +pub fn remote_timeline_path( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, +) -> RemotePath { + remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Note 
that the shard component of a remote layer path is _not_ always the same +/// as in the TenantShardId of the caller: tenants may reference layers from a different +/// ShardIndex. Use the ShardIndex from the layer's metadata. pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, @@ -1637,14 +1633,12 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId } pub fn remote_index_path( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - shard: ShardIndex, generation: Generation, ) -> RemotePath { RemotePath::from_string(&format!( - "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", - shard.get_suffix(), + "tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", IndexPart::FILE_NAME, generation.get_suffix() )) @@ -1786,14 +1780,14 @@ mod tests { Arc::new(RemoteTimelineClient { conf: self.harness.conf, runtime: tokio::runtime::Handle::current(), - tenant_id: self.harness.tenant_id, + tenant_shard_id: self.harness.tenant_shard_id, timeline_id: TIMELINE_ID, generation, storage_impl: self.harness.remote_storage.clone(), deletion_queue_client: self.harness.deletion_queue.new_client(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( - &self.harness.tenant_id, + &self.harness.tenant_shard_id, &TIMELINE_ID, )), }) @@ -2100,11 +2094,7 @@ mod tests { assert_eq!(actual_c, expected_c); } - async fn inject_index_part( - test_state: &TestSetup, - generation: Generation, - shard: ShardIndex, - ) -> IndexPart { + async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed let example_metadata = TimelineMetadata::example(); let example_index_part = IndexPart::new( @@ -2126,9 +2116,8 @@ mod tests { let index_path = test_state.harness.remote_fs_dir.join( remote_index_path( - &test_state.harness.tenant_id, + &test_state.harness.tenant_shard_id, &TIMELINE_ID, - shard, generation, ) .get_path(), @@ -2168,12 +2157,7 @@ mod tests { // Simple case: we are in generation N, load the index from generation N - 1 let generation_n = 5; - let injected = inject_index_part( - &test_state, - Generation::new(generation_n - 1), - ShardIndex::unsharded(), - ) - .await; + let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await; @@ -2191,34 +2175,22 @@ mod tests { // A generation-less IndexPart exists in the bucket, we should find it let generation_n = 5; - let injected_none = - inject_index_part(&test_state, Generation::none(), ShardIndex::unsharded()).await; + let injected_none = inject_index_part(&test_state, Generation::none()).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await; // If a more recent-than-none generation exists, we should prefer to load that - let injected_1 = - inject_index_part(&test_state, Generation::new(1), ShardIndex::unsharded()).await; + let injected_1 = inject_index_part(&test_state, Generation::new(1)).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; // If a more-recent-than-me generation exists, we should ignore it. 
- let _injected_10 = - inject_index_part(&test_state, Generation::new(10), ShardIndex::unsharded()).await; + let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; // If a directly previous generation exists, _and_ an index exists in my own // generation, I should prefer my own generation. - let _injected_prev = inject_index_part( - &test_state, - Generation::new(generation_n - 1), - ShardIndex::unsharded(), - ) - .await; - let injected_current = inject_index_part( - &test_state, - Generation::new(generation_n), - ShardIndex::unsharded(), - ) - .await; + let _injected_prev = + inject_index_part(&test_state, Generation::new(generation_n - 1)).await; + let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await; assert_got_index_part( &test_state, Generation::new(generation_n), diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 3b2cb5b599..1e9dcfe76a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -9,7 +9,7 @@ use std::time::Duration; use anyhow::{anyhow, Context}; use camino::Utf8Path; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::TenantShardId; use tokio::fs; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; @@ -22,7 +22,7 @@ use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_time use crate::tenant::Generation; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use super::index::{IndexPart, LayerFileMetadata}; use super::{ @@ -40,7 +40,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, @@ -48,11 +48,11 @@ pub async fn download_layer_file<'a>( debug_assert_current_span_has_tenant_and_timeline_id(); let local_path = conf - .timeline_path(&tenant_id, &timeline_id) + .timeline_path(&tenant_shard_id, &timeline_id) .join(layer_file_name.file_name()); let remote_path = remote_layer_path( - &tenant_id, + &tenant_shard_id.tenant_id, &timeline_id, layer_metadata.shard, layer_file_name, @@ -171,10 +171,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool { /// List timelines of given tenant in remote storage pub async fn list_remote_timelines( storage: &GenericRemoteStorage, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_id); + let remote_path = remote_timelines_path(&tenant_shard_id); fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); @@ -182,7 +182,7 @@ pub async fn list_remote_timelines( let listing = download_retry_forever( || storage.list(Some(&remote_path), ListingMode::WithDelimiter), - &format!("list timelines for {tenant_id}"), + &format!("list timelines for {tenant_shard_id}"), cancel, ) .await?; @@ -192,7 +192,7 @@ pub async fn list_remote_timelines( for timeline_remote_storage_key in listing.prefixes { let 
object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") })?; match object_name.parse::() { @@ -213,13 +213,12 @@ pub async fn list_remote_timelines( async fn do_download_index_part( storage: &GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - shard: ShardIndex, index_generation: Generation, cancel: CancellationToken, ) -> Result { - let remote_path = remote_index_path(tenant_id, timeline_id, shard, index_generation); + let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( || async { @@ -255,9 +254,8 @@ async fn do_download_index_part( #[tracing::instrument(skip_all, fields(generation=?my_generation))] pub(super) async fn download_index_part( storage: &GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - shard: ShardIndex, my_generation: Generation, cancel: CancellationToken, ) -> Result { @@ -267,9 +265,8 @@ pub(super) async fn download_index_part( // Operating without generations: just fetch the generation-less path return do_download_index_part( storage, - tenant_id, + tenant_shard_id, timeline_id, - shard, my_generation, cancel, ) @@ -282,9 +279,8 @@ pub(super) async fn download_index_part( // This is an optimization to avoid doing the listing for the general case below. let res = do_download_index_part( storage, - tenant_id, + tenant_shard_id, timeline_id, - shard, my_generation, cancel.clone(), ) @@ -310,9 +306,8 @@ pub(super) async fn download_index_part( // This is an optimization to avoid doing the listing for the general case below. let res = do_download_index_part( storage, - tenant_id, + tenant_shard_id, timeline_id, - shard, my_generation.previous(), cancel.clone(), ) @@ -335,7 +330,7 @@ pub(super) async fn download_index_part( // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. 
- let index_prefix = remote_index_path(tenant_id, timeline_id, shard, Generation::none()); + let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = backoff::retry( || async { storage.list_files(Some(&index_prefix)).await }, |_| false, @@ -361,7 +356,7 @@ pub(super) async fn download_index_part( match max_previous_generation { Some(g) => { tracing::debug!("Found index_part in generation {g:?}"); - do_download_index_part(storage, tenant_id, timeline_id, shard, g, cancel).await + do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior @@ -369,9 +364,8 @@ pub(super) async fn download_index_part( tracing::info!("No index_part.json* found"); do_download_index_part( storage, - tenant_id, + tenant_shard_id, timeline_id, - shard, Generation::none(), cancel, ) diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 789a10cf54..4ca4438003 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -4,7 +4,7 @@ use anyhow::{bail, Context}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::TenantShardId; use std::io::ErrorKind; use tokio::fs; @@ -25,9 +25,8 @@ use tracing::info; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part<'a>( storage: &'a GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, - shard: ShardIndex, generation: Generation, index_part: &'a IndexPart, ) -> anyhow::Result<()> { @@ -44,11 +43,11 @@ pub(super) async fn upload_index_part<'a>( let index_part_size = index_part_bytes.len(); let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); - let remote_path = remote_index_path(tenant_id, timeline_id, shard, generation); + let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); storage .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path) .await - .with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'")) + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. 
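The remote_timeline_client changes above settle on a single rule for remote paths: the caller's shard identity rides in the tenants/{tenant_shard_id} prefix, so remote_index_path drops its separate ShardIndex parameter, while remote_layer_path keeps a per-layer ShardIndex taken from the layer's metadata (see the doc comment added on remote_layer_path). Below is a hedged sketch of the resulting index-path layout; the constant values and helper name are assumptions for illustration, and the generation suffix appended by Generation::get_suffix() is omitted.

use pageserver_api::shard::TenantShardId;
use utils::id::TimelineId;

// Assumed literal values for this sketch; the real constants live in the pageserver crate.
const TIMELINES_SEGMENT_NAME: &str = "timelines";
const INDEX_PART_FILE_NAME: &str = "index_part.json";

// Mirrors the format string used by remote_index_path above, minus the generation suffix.
fn sketch_remote_index_path(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> String {
    format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INDEX_PART_FILE_NAME}")
}

This is also why upload_index_part and download_index_part in the hunks above now take a &TenantShardId instead of a TenantId plus ShardIndex.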
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3b2a61dcba..c933342822 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -24,10 +24,7 @@ use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; use utils::rate_limit::RateLimit; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; @@ -304,12 +301,14 @@ pub trait AsLayerDesc { } pub mod tests { + use pageserver_api::shard::TenantShardId; + use super::*; impl From for PersistentLayerDesc { fn from(value: DeltaFileName) -> Self { PersistentLayerDesc::new_delta( - TenantId::from_array([0; 16]), + TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), value.key_range, value.lsn_range, @@ -321,7 +320,7 @@ pub mod tests { impl From for PersistentLayerDesc { fn from(value: ImageFileName) -> Self { PersistentLayerDesc::new_img( - TenantId::from_array([0; 16]), + TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), value.key_range, value.lsn, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 79f37dcb2d..e9886d90c4 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -42,6 +42,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::models::LayerAccessKind; +use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -86,7 +87,7 @@ pub struct Summary { impl From<&DeltaLayer> for Summary { fn from(layer: &DeltaLayer) -> Self { Self::expected( - layer.desc.tenant_id, + layer.desc.tenant_shard_id.tenant_id, layer.desc.timeline_id, layer.desc.key_range.clone(), layer.desc.lsn_range.clone(), @@ -248,7 +249,7 @@ impl DeltaLayer { fn temp_path_for( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, key_start: Key, lsn_range: &Range, @@ -259,14 +260,15 @@ impl DeltaLayer { .map(char::from) .collect(); - conf.timeline_path(tenant_id, timeline_id).join(format!( - "{}-XXX__{:016X}-{:016X}.{}.{}", - key_start, - u64::from(lsn_range.start), - u64::from(lsn_range.end), - rand_string, - TEMP_FILE_SUFFIX, - )) + conf.timeline_path(tenant_shard_id, timeline_id) + .join(format!( + "{}-XXX__{:016X}-{:016X}.{}.{}", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end), + rand_string, + TEMP_FILE_SUFFIX, + )) } /// @@ -318,10 +320,14 @@ impl DeltaLayer { .metadata() .context("get file metadata to determine size")?; + // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary. + // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn. 
+ let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); + Ok(DeltaLayer { path: path.to_path_buf(), desc: PersistentLayerDesc::new_delta( - summary.tenant_id, + tenant_shard_id, summary.timeline_id, summary.key_range, summary.lsn_range, @@ -353,7 +359,7 @@ struct DeltaLayerWriterInner { conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, @@ -370,7 +376,7 @@ impl DeltaLayerWriterInner { async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, ) -> anyhow::Result { @@ -380,7 +386,8 @@ impl DeltaLayerWriterInner { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range); + let path = + DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); let mut file = VirtualFile::create(&path).await?; // make room for the header block @@ -395,7 +402,7 @@ impl DeltaLayerWriterInner { conf, path, timeline_id, - tenant_id, + tenant_shard_id, key_start, lsn_range, tree: tree_builder, @@ -457,7 +464,7 @@ impl DeltaLayerWriterInner { let summary = Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), @@ -498,7 +505,7 @@ impl DeltaLayerWriterInner { // set inner.file here. The first read will have to re-open it. let desc = PersistentLayerDesc::new_delta( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_start..key_end, self.lsn_range.clone(), @@ -549,14 +556,20 @@ impl DeltaLayerWriter { pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, ) -> anyhow::Result { Ok(Self { inner: Some( - DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range) - .await?, + DeltaLayerWriterInner::new( + conf, + timeline_id, + tenant_shard_id, + key_start, + lsn_range, + ) + .await?, ), }) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c38a9f6883..208aa07872 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,6 +41,7 @@ use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; use pageserver_api::models::LayerAccessKind; +use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -87,7 +88,7 @@ pub(super) struct Summary { impl From<&ImageLayer> for Summary { fn from(layer: &ImageLayer) -> Self { Self::expected( - layer.desc.tenant_id, + layer.desc.tenant_shard_id.tenant_id, layer.desc.timeline_id, layer.desc.key_range.clone(), layer.lsn, @@ -217,7 +218,7 @@ impl ImageLayer { fn temp_path_for( conf: &PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, fname: &ImageFileName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() @@ -226,7 +227,7 @@ impl ImageLayer { .map(char::from) .collect(); - conf.timeline_path(&tenant_id, &timeline_id) + conf.timeline_path(&tenant_shard_id, &timeline_id) 
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } @@ -276,10 +277,15 @@ impl ImageLayer { let metadata = file .metadata() .context("get file metadata to determine size")?; + + // TODO(sharding): we should get TenantShardId from path. + // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart. + let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); + Ok(ImageLayer { path: path.to_path_buf(), desc: PersistentLayerDesc::new_img( - summary.tenant_id, + tenant_shard_id, summary.timeline_id, summary.key_range, summary.lsn, @@ -400,7 +406,7 @@ struct ImageLayerWriterInner { conf: &'static PageServerConf, path: Utf8PathBuf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_range: Range, lsn: Lsn, @@ -415,7 +421,7 @@ impl ImageLayerWriterInner { async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { @@ -424,7 +430,7 @@ impl ImageLayerWriterInner { let path = ImageLayer::temp_path_for( conf, timeline_id, - tenant_id, + tenant_shard_id, &ImageFileName { key_range: key_range.clone(), lsn, @@ -448,7 +454,7 @@ impl ImageLayerWriterInner { conf, path, timeline_id, - tenant_id, + tenant_shard_id, key_range: key_range.clone(), lsn, tree: tree_builder, @@ -495,7 +501,7 @@ impl ImageLayerWriterInner { let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, key_range: self.key_range.clone(), lsn: self.lsn, @@ -521,7 +527,7 @@ impl ImageLayerWriterInner { .context("get metadata to determine file size")?; let desc = PersistentLayerDesc::new_img( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_range.clone(), self.lsn, @@ -577,13 +583,14 @@ impl ImageLayerWriter { pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?, + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + .await?, ), }) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2cb1e55b26..003cf0e92b 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -14,15 +14,11 @@ use crate::tenant::Timeline; use crate::walrecord; use anyhow::{ensure, Result}; use pageserver_api::models::InMemoryLayerInfo; +use pageserver_api::shard::TenantShardId; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; use tracing::*; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, - vec_map::VecMap, -}; +use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; @@ -33,7 +29,7 @@ use super::{DeltaLayerWriter, ResidentLayer}; pub struct InMemoryLayer { conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, /// This layer contains all the changes from 'start_lsn'. 
The @@ -226,17 +222,17 @@ impl InMemoryLayer { pub async fn create( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, start_lsn: Lsn, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; Ok(InMemoryLayer { conf, timeline_id, - tenant_id, + tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), inner: RwLock::new(InMemoryLayerInner { @@ -335,7 +331,7 @@ impl InMemoryLayer { let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, ) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index c27c3e69ed..3ed4e05bea 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -82,7 +82,7 @@ impl Layer { metadata: LayerFileMetadata, ) -> Self { let desc = PersistentLayerDesc::from_filename( - timeline.tenant_id, + timeline.tenant_shard_id, timeline.timeline_id, file_name, metadata.file_size(), @@ -113,7 +113,7 @@ impl Layer { metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( - timeline.tenant_id, + timeline.tenant_shard_id, timeline.timeline_id, file_name, metadata.file_size(), @@ -486,7 +486,7 @@ impl Drop for LayerInner { return; } - let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id); + let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); let file_name = self.layer_desc().filename(); @@ -561,7 +561,7 @@ impl LayerInner { shard: ShardIndex, ) -> Self { let path = conf - .timeline_path(&timeline.tenant_id, &timeline.timeline_id) + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) .join(desc.filename().to_string()); let (inner, version) = if let Some(inner) = downloaded { @@ -832,7 +832,7 @@ impl LayerInner { crate::task_mgr::spawn( &tokio::runtime::Handle::current(), crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_id), + Some(self.desc.tenant_shard_id.tenant_id), Some(self.desc.timeline_id), &task_name, false, @@ -997,7 +997,7 @@ impl LayerInner { if gc { // do nothing now, only in LayerInner::drop } else if can_evict && evict { - let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version); + let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); // downgrade for queueing, in case there's a tear down already ongoing we should not // hold it alive. 
@@ -1229,7 +1229,7 @@ impl DownloadedLayer { let res = if owner.desc.is_delta { let summary = Some(delta_layer::Summary::expected( - owner.desc.tenant_id, + owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), @@ -1240,7 +1240,7 @@ impl DownloadedLayer { } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( - owner.desc.tenant_id, + owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, owner.desc.key_range.clone(), lsn, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2e0b0b3e64..bf24407fc5 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,9 +1,7 @@ use core::fmt::Display; +use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; @@ -11,12 +9,15 @@ use super::{DeltaFileName, ImageFileName, LayerFileName}; use serde::{Deserialize, Serialize}; +#[cfg(test)] +use utils::id::TenantId; + /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct PersistentLayerDesc { - pub tenant_id: TenantId, + pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, /// Range of keys that this layer covers pub key_range: Range, @@ -56,7 +57,7 @@ impl PersistentLayerDesc { #[cfg(test)] pub fn new_test(key_range: Range) -> Self { Self { - tenant_id: TenantId::generate(), + tenant_shard_id: TenantShardId::unsharded(TenantId::generate()), timeline_id: TimelineId::generate(), key_range, lsn_range: Lsn(0)..Lsn(1), @@ -66,14 +67,14 @@ impl PersistentLayerDesc { } pub fn new_img( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key_range: Range, lsn: Lsn, file_size: u64, ) -> Self { Self { - tenant_id, + tenant_shard_id, timeline_id, key_range, lsn_range: Self::image_layer_lsn_range(lsn), @@ -83,14 +84,14 @@ impl PersistentLayerDesc { } pub fn new_delta( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key_range: Range, lsn_range: Range, file_size: u64, ) -> Self { Self { - tenant_id, + tenant_shard_id, timeline_id, key_range, lsn_range, @@ -100,18 +101,22 @@ impl PersistentLayerDesc { } pub fn from_filename( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, filename: LayerFileName, file_size: u64, ) -> Self { match filename { LayerFileName::Image(i) => { - Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size) - } - LayerFileName::Delta(d) => { - Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size) + Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } + LayerFileName::Delta(d) => Self::new_delta( + tenant_shard_id, + timeline_id, + d.key_range, + d.lsn_range, + file_size, + ), } } @@ -172,10 +177,6 @@ impl PersistentLayerDesc { self.timeline_id } - pub fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - /// Does this layer only contain some data for the key-range (incremental), /// or does it contain a version of every page? 
This is important to know /// for garbage collecting old layers: an incremental layer depends on @@ -192,7 +193,7 @@ impl PersistentLayerDesc { if self.is_delta { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----", - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_range.start, self.key_range.end, @@ -204,7 +205,7 @@ impl PersistentLayerDesc { } else { println!( "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----", - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_range.start, self.key_range.end, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 860bb255ca..138578ec8a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -86,7 +86,7 @@ pub fn start_background_loops( tenant: &Arc, background_jobs_can_start: Option<&completion::Barrier>, ) { - let tenant_id = tenant.tenant_id; + let tenant_id = tenant.tenant_shard_id.tenant_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a7cf427de5..24f59673c1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,8 +13,12 @@ use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::models::{ - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, TimelineState, +use pageserver_api::{ + models::{ + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, + TimelineState, + }, + shard::TenantShardId, }; use serde_with::serde_as; use storage_broker::BrokerClientChannel; @@ -149,7 +153,7 @@ pub struct Timeline { myself: Weak, - pub tenant_id: TenantId, + pub(crate) tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects. 
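With `Timeline.tenant_id` replaced by `pub(crate) tenant_shard_id`, every call site that still works in bare tenant ids (task_mgr, metrics, the page cache) reaches through `.tenant_shard_id.tenant_id`, as the hunks below show. A hypothetical accessor, not something this diff adds, that states the relationship:

    use utils::id::TenantId;

    impl Timeline {
        // Every shard of a tenant shares the same TenantId; only shard_number and
        // shard_count differ, so dropping to the bare id is always well-defined.
        pub(crate) fn bare_tenant_id(&self) -> TenantId {
            self.tenant_shard_id.tenant_id
        }
    }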
@@ -701,7 +705,7 @@ impl Timeline { } /// Flush to disk all data that was written with the put_* functions - #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] + #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait().await @@ -937,7 +941,7 @@ impl Timeline { tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), ) .await; @@ -988,7 +992,7 @@ impl Timeline { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), ) .await; @@ -1006,7 +1010,12 @@ impl Timeline { tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await; + task_mgr::shutdown_tasks( + None, + Some(self.tenant_shard_id.tenant_id), + Some(self.timeline_id), + ) + .await; // Finally wait until any gate-holders are complete self.gate.close().await; @@ -1125,7 +1134,7 @@ impl Timeline { } } - #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] + #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1330,7 +1339,11 @@ impl Timeline { &self.tenant_conf.read().unwrap().tenant_conf, &self.conf.default_tenant_conf, ); - let tenant_id_str = self.tenant_id.to_string(); + + // TODO(sharding): make evictions state shard aware + // (https://github.com/neondatabase/neon/issues/5953) + let tenant_id_str = self.tenant_shard_id.tenant_id.to_string(); + let timeline_id_str = self.timeline_id.to_string(); self.metrics .evictions_with_low_residence_duration @@ -1350,7 +1363,7 @@ impl Timeline { metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, generation: Generation, walredo_mgr: Arc, resources: TimelineResources, @@ -1381,7 +1394,7 @@ impl Timeline { tenant_conf, myself: myself.clone(), timeline_id, - tenant_id, + tenant_shard_id, generation, pg_version, layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), @@ -1408,7 +1421,7 @@ impl Timeline { ancestor_lsn: metadata.ancestor_lsn(), metrics: TimelineMetrics::new( - &tenant_id, + &tenant_shard_id.tenant_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", @@ -1459,7 +1472,7 @@ impl Timeline { initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), cancel, - gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")), + gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), @@ -1481,14 +1494,14 @@ impl Timeline { FlushLoopState::Running { .. 
} => { info!( "skipping attempt to start flush_loop twice {}/{}", - self.tenant_id, self.timeline_id + self.tenant_shard_id, self.timeline_id ); return; } FlushLoopState::Exited => { warn!( "ignoring attempt to restart exited flush_loop {}/{}", - self.tenant_id, self.timeline_id + self.tenant_shard_id, self.timeline_id ); return; } @@ -1507,7 +1520,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "layer flush task", false, @@ -1519,7 +1532,7 @@ impl Timeline { *flush_loop_state = FlushLoopState::Exited; Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id)) + .instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id)) ); } @@ -1534,7 +1547,7 @@ impl Timeline { ) { info!( "launching WAL receiver for timeline {} of tenant {}", - self.timeline_id, self.tenant_id + self.timeline_id, self.tenant_shard_id ); let tenant_conf_guard = self.tenant_conf.read().unwrap(); @@ -1595,7 +1608,9 @@ impl Timeline { // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); + let timeline_path = self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id); let conf = self.conf; let span = tracing::Span::current(); @@ -1802,7 +1817,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "initial size calculation", false, @@ -1912,7 +1927,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, @@ -1988,7 +2003,7 @@ impl Timeline { fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { if !self .conf - .metadata_path(&self.tenant_id, &self.timeline_id) + .metadata_path(&self.tenant_shard_id, &self.timeline_id) .exists() { error!("timeline-calculate-logical-size-pre metadata file does not exist") @@ -2341,7 +2356,13 @@ impl Timeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. // We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = cache - .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx) + .lookup_materialized_page( + self.tenant_shard_id.tenant_id, + self.timeline_id, + key, + lsn, + ctx, + ) .await?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) @@ -2369,7 +2390,7 @@ impl Timeline { self.get_last_record_lsn(), self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, ) .await?; Ok(layer) @@ -2535,7 +2556,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. 
- #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, @@ -2656,9 +2677,14 @@ impl Timeline { // If we updated our disk_consistent_lsn, persist the updated metadata to local disk. if let Some(metadata) = metadata { - save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata) - .await - .context("save_metadata")?; + save_metadata( + self.conf, + &self.tenant_shard_id, + &self.timeline_id, + &metadata, + ) + .await + .context("save_metadata")?; } Ok(()) } @@ -2722,9 +2748,14 @@ impl Timeline { ) -> anyhow::Result<()> { let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; - save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata) - .await - .context("save_metadata")?; + save_metadata( + self.conf, + &self.tenant_shard_id, + &self.timeline_id, + &metadata, + ) + .await + .context("save_metadata")?; Ok(()) } @@ -2772,7 +2803,7 @@ impl Timeline { par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; par_fsync::par_fsync(&[self_clone .conf - .timeline_path(&self_clone.tenant_id, &self_clone.timeline_id)]) + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) .context("fsync of timeline dir")?; anyhow::Ok(new_delta) @@ -2928,7 +2959,7 @@ impl Timeline { let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, &img_range, lsn, ) @@ -3001,9 +3032,11 @@ impl Timeline { .await .context("fsync of newly created layer files")?; - par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + par_fsync::par_fsync_async(&[self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) + .await + .context("fsync of timeline dir")?; let mut guard = self.layers.write().await; @@ -3489,7 +3522,7 @@ impl Timeline { DeltaLayerWriter::new( self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, key, if dup_end_lsn.is_valid() { // this is a layer containing slice of values of the same key @@ -3550,7 +3583,9 @@ impl Timeline { .await .context("fsync all new layers")?; - let timeline_dir = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); + let timeline_dir = self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id); par_fsync::par_fsync_async(&[timeline_dir]) .await @@ -3601,7 +3636,7 @@ impl Timeline { let ctx = ctx.attached_child(); let mut stats = CompactLevel0Phase1StatsBuilder { version: Some(2), - tenant_id: Some(self.tenant_id), + tenant_id: Some(self.tenant_shard_id.tenant_id), timeline_id: Some(self.timeline_id), ..Default::default() }; @@ -4062,7 +4097,7 @@ impl Timeline { let cache = page_cache::get(); if let Err(e) = cache .memorize_materialized_page( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, key, last_rec_lsn, @@ -4106,7 +4141,7 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "download all remote layers task", false, @@ -4128,7 +4163,7 @@ impl Timeline { }; Ok(()) } - .instrument(info_span!(parent: None, "download_all_remote_layers", 
tenant_id = %self.tenant_id, timeline_id = %self.timeline_id)) + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id)) ); let initial_info = DownloadRemoteLayersTaskInfo { @@ -4329,8 +4364,10 @@ impl Timeline { } pub(crate) fn get_shard_index(&self) -> ShardIndex { - // TODO: carry this on the struct - ShardIndex::unsharded() + ShardIndex { + shard_number: self.tenant_shard_id.shard_number, + shard_count: self.tenant_shard_id.shard_count, + } } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index fefeafb7d3..497796c80a 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -4,13 +4,10 @@ use std::{ }; use anyhow::Context; -use pageserver_api::models::TimelineState; +use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; use tracing::{debug, error, info, instrument, warn, Instrument, Span}; -use utils::{ - crashsafe, fs_ext, - id::{TenantId, TimelineId}, -}; +use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ config::PageServerConf, @@ -47,7 +44,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_id), + Some(timeline.tenant_shard_id.tenant_id), Some(timeline.timeline_id), ) .await; @@ -73,7 +70,12 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // NB: This and other delete_timeline calls do not run as a task_mgr task, // so, they are not affected by this shutdown_tasks() call. info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await; + task_mgr::shutdown_tasks( + None, + Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.timeline_id), + ) + .await; fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( @@ -125,7 +127,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi // pub(super): documentation link pub(super) async fn delete_local_layer_files( conf: &PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; @@ -139,7 +141,7 @@ pub(super) async fn delete_local_layer_files( // NB: storage_sync upload tasks that reference these layers have been cancelled // by the caller. - let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id); + let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id); fail::fail_point!("timeline-delete-before-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? 
@@ -175,7 +177,7 @@ pub(super) async fn delete_local_layer_files( return Ok(()); } - let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id); + let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id); for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { #[cfg(feature = "testing")] @@ -250,11 +252,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( // (nothing can fail after its deletion) async fn cleanup_remaining_timeline_fs_traces( conf: &PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> anyhow::Result<()> { // Remove local metadata - tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id)) + tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id)) .await .or_else(fs_ext::ignore_not_found) .context("remove metadata")?; @@ -266,7 +268,7 @@ async fn cleanup_remaining_timeline_fs_traces( }); // Remove timeline dir - tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id)) + tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id)) .await .or_else(fs_ext::ignore_not_found) .context("timeline dir")?; @@ -281,7 +283,7 @@ async fn cleanup_remaining_timeline_fs_traces( // to be reordered later and thus missed if a crash occurs. // Note that we dont need to sync after mark file is removed // because we can tolerate the case when mark file reappears on startup. - let timeline_path = conf.timelines_path(&tenant_id); + let timeline_path = conf.timelines_path(&tenant_shard_id); crashsafe::fsync_async(timeline_path) .await .context("fsync_pre_mark_remove")?; @@ -289,7 +291,7 @@ async fn cleanup_remaining_timeline_fs_traces( // Remove delete mark // TODO: once we are confident that no more exist in the field, remove this // line. It cleans up a legacy marker file that might in rare cases be present. - tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id)) + tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id)) .await .or_else(fs_ext::ignore_not_found) .context("remove delete mark") @@ -355,7 +357,7 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! 
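Returning to the `get_shard_index()` hunk in timeline.rs above: the index is now derived from the stored `TenantShardId` instead of being hard-coded to `ShardIndex::unsharded()`. A sketch of the derivation, assuming `ShardIndex` sits alongside `TenantShardId` in `pageserver_api::shard` and that an unsharded tenant yields the same index as before:

    use pageserver_api::shard::{ShardIndex, TenantShardId};

    // Mirrors Timeline::get_shard_index() after this change.
    fn shard_index_of(tenant_shard_id: &TenantShardId) -> ShardIndex {
        ShardIndex {
            shard_number: tenant_shard_id.shard_number,
            shard_count: tenant_shard_id.shard_count,
        }
    }

    // Expected to hold for a tenant that has never been split:
    // shard_index_of(&TenantShardId::unsharded(tenant_id)) == ShardIndex::unsharded()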
- #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))] + #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))] pub async fn run( tenant: &Arc, timeline_id: TimelineId, @@ -451,7 +453,8 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, ) -> anyhow::Result<()> { let r = - cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await; + cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id) + .await; info!("Done"); r } @@ -522,13 +525,13 @@ impl DeleteTimelineFlow { tenant: Arc, timeline: Arc, ) { - let tenant_id = timeline.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_id), + Some(tenant_shard_id.tenant_id), Some(timeline_id), "timeline_delete", false, @@ -541,7 +544,7 @@ impl DeleteTimelineFlow { } .instrument({ let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); + tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id); span.follows_from(Span::current()); span }), @@ -554,13 +557,14 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline: &Timeline, ) -> Result<(), DeleteTimelineError> { - delete_local_layer_files(conf, tenant.tenant_id, timeline).await?; + delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?; delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?; + cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id) + .await?; remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 2d0f1c609b..3fe4bc0f83 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -60,9 +60,12 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), - &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), + &format!( + "layer eviction for {}/{}", + self.tenant_shard_id, self.timeline_id + ), false, async move { let cancel = task_mgr::shutdown_token(); @@ -77,7 +80,7 @@ impl Timeline { ); } - #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] + #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { use crate::tenant::tasks::random_init_delay; { @@ -340,7 +343,7 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. 
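The spawn and shutdown calls in these hunks all pass `tenant_shard_id.tenant_id` because `task_mgr` is still keyed by bare `TenantId`. A hypothetical wrapper, not part of this diff, that captures the repeated shutdown pattern (module paths assumed from the calls above):

    use crate::task_mgr::{self, TaskKind};
    use crate::tenant::timeline::Timeline;

    // Narrow a task_mgr shutdown to one timeline; the shard identity is dropped because
    // task_mgr does not know about shards yet.
    async fn shutdown_timeline_tasks(timeline: &Timeline, kind: Option<TaskKind>) {
        task_mgr::shutdown_tasks(
            kind,
            Some(timeline.tenant_shard_id.tenant_id),
            Some(timeline.timeline_id),
        )
        .await;
    }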
- let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) { + let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) { Ok(t) => t, Err(_) => { return ControlFlow::Break(()); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 7e1aa279d3..dcd82949dd 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,8 +1,9 @@ use anyhow::{bail, ensure, Context, Result}; +use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; use utils::{ - id::{TenantId, TimelineId}, + id::TimelineId, lsn::{AtomicLsn, Lsn}, }; @@ -73,7 +74,7 @@ impl LayerManager { last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> Result> { ensure!(lsn.is_aligned()); @@ -109,7 +110,8 @@ impl LayerManager { lsn ); - let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index f9bb6ca419..61130f541a 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -43,11 +43,11 @@ impl<'t> UninitializedTimeline<'t> { /// The caller is responsible for activating the timeline (function `.activate()`). pub(crate) fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; - let tenant_id = self.owning_tenant.tenant_id; + let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { return Err(anyhow::anyhow!( - "No timeline for initialization found for {tenant_id}/{timeline_id}" + "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); } @@ -61,13 +61,13 @@ impl<'t> UninitializedTimeline<'t> { anyhow::ensure!( new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" ); let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { Entry::Occupied(_) => anyhow::bail!( - "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ), Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not @@ -79,7 +79,7 @@ impl<'t> UninitializedTimeline<'t> { // this should be an assertion. uninit_mark.remove_uninit_mark().with_context(|| { format!( - "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" + "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}" ) })?; v.insert(Arc::clone(&new_timeline)); @@ -134,7 +134,7 @@ impl<'t> UninitializedTimeline<'t> { .with_context(|| { format!( "No raw timeline {}/{} found", - self.owning_tenant.tenant_id, self.timeline_id + self.owning_tenant.tenant_shard_id, self.timeline_id ) })? 
.0) @@ -144,7 +144,7 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { if let Some((_, uninit_mark)) = self.raw_timeline.take() { - let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered(); + let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); error!("Timeline got dropped without initializing, cleaning its files"); cleanup_timeline_directory(uninit_mark); } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 842bc3675c..04ff8602d6 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -71,7 +71,7 @@ impl WalReceiver { mut broker_client: BrokerClientChannel, ctx: &RequestContext, ) -> Self { - let tenant_id = timeline.tenant_id; + let tenant_id = timeline.tenant_shard_id.tenant_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 3077712445..7bfa246eeb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step( } let id = TenantTimelineId { - tenant_id: connection_manager_state.timeline.tenant_id, + tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id, timeline_id: connection_manager_state.timeline.timeline_id, }; @@ -388,7 +388,7 @@ struct BrokerSkTimeline { impl ConnectionManagerState { pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { let id = TenantTimelineId { - tenant_id: timeline.tenant_id, + tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, }; Self { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3e56753ad4..2b4aea7596 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection( task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_id), + Some(timeline.tenant_shard_id.tenant_id), Some(timeline.timeline_id), "walreceiver connection", false, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 4e684dec2d..ed468f220e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -41,6 +41,9 @@ use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; +#[cfg(feature = "testing")] +use pageserver_api::shard::TenantShardId; + use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, @@ -991,7 +994,11 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - let path = 
self.conf.tenant_path(&self.tenant_id).join(&filename); + // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. + let path = self + .conf + .tenant_path(&TenantShardId::unsharded(self.tenant_id)) + .join(&filename); let res = std::fs::OpenOptions::new() .write(true)
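The TODO above leaves `WalRedoProcess` on a bare `TenantId` for now and wraps it in `TenantShardId::unsharded()` at the call site. A sketch of that follow-up; the helper name and the `Utf8PathBuf` return type are assumptions:

    use camino::Utf8PathBuf;
    use pageserver_api::shard::TenantShardId;

    use crate::config::PageServerConf;

    // Once WalRedoProcess carries a TenantShardId, the dump path can be built directly.
    fn walredo_dump_path(
        conf: &PageServerConf,
        tenant_shard_id: &TenantShardId,
        filename: &str,
    ) -> Utf8PathBuf {
        conf.tenant_path(tenant_shard_id).join(filename)
    }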