mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-24 08:30:37 +00:00
Merge branch 'main' into vlad/get-vectored-read-path
This commit is contained in:
@@ -272,6 +272,12 @@ fn start_pageserver(
|
||||
);
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
set_launch_timestamp_metric(launch_ts);
|
||||
#[cfg(target_os = "linux")]
|
||||
metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
|
||||
metrics::register_internal(Box::new(
|
||||
pageserver::metrics::tokio_epoll_uring::Collector::new(),
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
|
||||
@@ -1383,6 +1383,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
@@ -1450,6 +1451,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
max_keys_per_list_response: None,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
);
|
||||
|
||||
@@ -234,7 +234,7 @@ impl DeletionHeader {
|
||||
let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
|
||||
let header_path = conf.deletion_header_path();
|
||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion header")?;
|
||||
|
||||
@@ -325,7 +325,7 @@ impl DeletionList {
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
|
||||
|
||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion list")
|
||||
.map_err(Into::into)
|
||||
@@ -834,7 +834,6 @@ mod test {
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
@@ -868,6 +867,7 @@ mod test {
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
@@ -1171,6 +1171,7 @@ pub(crate) mod mock {
|
||||
pub struct ConsumerState {
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl ConsumerState {
|
||||
@@ -1184,7 +1185,7 @@ pub(crate) mod mock {
|
||||
match msg {
|
||||
DeleterMessage::Delete(objects) => {
|
||||
for path in objects {
|
||||
match remote_storage.delete(&path).await {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
@@ -1217,7 +1218,7 @@ pub(crate) mod mock {
|
||||
|
||||
for path in objects {
|
||||
info!("Executing deletion {path}");
|
||||
match remote_storage.delete(&path).await {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
@@ -1267,7 +1268,11 @@ pub(crate) mod mock {
|
||||
executor_tx,
|
||||
executed,
|
||||
remote_storage,
|
||||
consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
|
||||
consumer: std::sync::Mutex::new(ConsumerState {
|
||||
rx,
|
||||
executor_rx,
|
||||
cancel: CancellationToken::new(),
|
||||
}),
|
||||
lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -71,9 +72,11 @@ impl Deleter {
|
||||
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
self.remote_storage
|
||||
.delete_objects(&self.accumulator, &self.cancel)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
3,
|
||||
10,
|
||||
"executing deletion batch",
|
||||
|
||||
@@ -351,7 +351,6 @@ pub enum IterationOutcome<U> {
|
||||
Finished(IterationOutcomeFinished<U>),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct IterationOutcomeFinished<U> {
|
||||
/// The actual usage observed before we started the iteration.
|
||||
@@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished<U> {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[allow(dead_code)]
|
||||
struct AssumedUsage<U> {
|
||||
/// The expected value for `after`, after phase 2.
|
||||
projected_after: U,
|
||||
@@ -374,14 +372,12 @@ struct AssumedUsage<U> {
|
||||
failed: LayerCount,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
struct PlannedUsage<U> {
|
||||
respecting_tenant_min_resident_size: U,
|
||||
fallback_to_global_lru: Option<U>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default, Serialize)]
|
||||
struct LayerCount {
|
||||
file_sizes: u64,
|
||||
@@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer {
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum EvictionLayer {
|
||||
Attached(Layer),
|
||||
#[allow(dead_code)]
|
||||
Secondary(EvictionSecondaryLayer),
|
||||
}
|
||||
|
||||
@@ -623,6 +618,7 @@ impl std::fmt::Display for EvictionLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct DiskUsageEvictionInfo {
|
||||
/// Timeline's largest layer (remote or resident)
|
||||
pub max_layer_size: Option<u64>,
|
||||
@@ -854,19 +850,27 @@ async fn collect_eviction_candidates(
|
||||
|
||||
let total = tenant_candidates.len();
|
||||
|
||||
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0; this is for us to evict it last.
|
||||
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
|
||||
let tenant_candidates =
|
||||
tenant_candidates
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(i, mut candidate)| {
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0; this is for us to evict it last.
|
||||
candidate.relative_last_activity =
|
||||
eviction_order.relative_last_activity(total, i);
|
||||
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
} else {
|
||||
MinResidentSizePartition::Below
|
||||
};
|
||||
cumsum += i128::from(candidate.layer.get_file_size());
|
||||
candidates.push((partition, candidate));
|
||||
}
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
} else {
|
||||
MinResidentSizePartition::Below
|
||||
};
|
||||
cumsum += i128::from(candidate.layer.get_file_size());
|
||||
|
||||
(partition, candidate)
|
||||
});
|
||||
|
||||
candidates.extend(tenant_candidates);
|
||||
}
|
||||
|
||||
// Note: the same tenant ID might be hit twice, if it transitions from attached to
|
||||
@@ -882,21 +886,41 @@ async fn collect_eviction_candidates(
|
||||
);
|
||||
|
||||
for secondary_tenant in secondary_tenants {
|
||||
let mut layer_info = secondary_tenant.get_layers_for_eviction();
|
||||
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
|
||||
// to prevent repeated disk usage based evictions from completely draining less often
|
||||
// updating secondaries.
|
||||
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
|
||||
|
||||
debug_assert!(
|
||||
total_layers >= layer_info.resident_layers.len(),
|
||||
"total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
|
||||
layer_info.resident_layers.len()
|
||||
);
|
||||
|
||||
layer_info
|
||||
.resident_layers
|
||||
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
|
||||
candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
|
||||
(
|
||||
// Secondary locations' layers are always considered above the min resident size,
|
||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||
// the layers have sufficiently old access times.
|
||||
MinResidentSizePartition::Above,
|
||||
candidate,
|
||||
)
|
||||
}));
|
||||
let tenant_candidates =
|
||||
layer_info
|
||||
.resident_layers
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(i, mut candidate)| {
|
||||
candidate.relative_last_activity =
|
||||
eviction_order.relative_last_activity(total_layers, i);
|
||||
(
|
||||
// Secondary locations' layers are always considered above the min resident size,
|
||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||
// the layers have sufficiently old access times.
|
||||
MinResidentSizePartition::Above,
|
||||
candidate,
|
||||
)
|
||||
});
|
||||
|
||||
candidates.extend(tenant_candidates);
|
||||
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
@@ -1076,7 +1100,6 @@ mod filesystem_level_usage {
|
||||
use super::DiskUsageEvictionTaskConfig;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[allow(dead_code)]
|
||||
pub struct Usage<'a> {
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
|
||||
|
||||
@@ -19,11 +19,14 @@ use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
};
|
||||
use pageserver_api::shard::ShardCount;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
@@ -419,6 +422,7 @@ async fn build_timeline_info_common(
|
||||
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||
tenant::timeline::logical_size::Accuracy::Exact => true,
|
||||
},
|
||||
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: None,
|
||||
timeline_dir_layer_file_size_sum: None,
|
||||
@@ -485,7 +489,9 @@ async fn timeline_create_handler(
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -495,48 +501,62 @@ async fn timeline_create_handler(
|
||||
tracing::info!("bootstrapping");
|
||||
}
|
||||
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||
request_data.existing_initdb_timeline_id,
|
||||
state.broker_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await {
|
||||
match tenant
|
||||
.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id,
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||
request_data.existing_initdb_timeline_id,
|
||||
state.broker_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(new_timeline) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let timeline_info = build_timeline_info_common(
|
||||
&new_timeline,
|
||||
&ctx,
|
||||
tenant::timeline::GetLogicalSizePriority::User,
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
Err(_) if tenant.cancel.is_cancelled() => {
|
||||
// In case we get some ugly error type during shutdown, cast it into a clean 503.
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
|
||||
json_response(StatusCode::CONFLICT, ())
|
||||
}
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
|
||||
json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
|
||||
format!("{err:#}")
|
||||
))
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::ShuttingDown) => {
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
|
||||
json_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
||||
)
|
||||
}
|
||||
Err(
|
||||
tenant::CreateTimelineError::Conflict
|
||||
| tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, ()),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
),
|
||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
HttpErrorBody::from_msg(e.to_string()),
|
||||
),
|
||||
Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
HttpErrorBody::from_msg("tenant shutting down".to_string()),
|
||||
),
|
||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_create",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
timeline_id = %new_timeline_id,
|
||||
lsn=?request_data.ancestor_start_lsn,
|
||||
pg_version=?request_data.pg_version
|
||||
))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -875,7 +895,7 @@ async fn tenant_reset_handler(
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.tenant_manager
|
||||
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
|
||||
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1104,6 +1124,25 @@ async fn tenant_size_handler(
|
||||
)
|
||||
}
|
||||
|
||||
async fn tenant_shard_split_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let req: TenantShardSplitRequest = json_request(&mut request).await?;
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
|
||||
}
|
||||
|
||||
async fn layer_map_info_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1908,6 +1947,16 @@ async fn post_tracing_event_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
|
||||
crate::virtual_file::io_engine::set(kind);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -2054,6 +2103,9 @@ pub fn make_router(
|
||||
.put("/v1/tenant/config", |r| {
|
||||
api_handler(r, update_tenant_config_handler)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
|
||||
api_handler(r, tenant_shard_split_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_shard_id/config", |r| {
|
||||
api_handler(r, get_tenant_config_handler)
|
||||
})
|
||||
@@ -2163,7 +2215,8 @@ pub fn make_router(
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|
||||
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size {
|
||||
});
|
||||
}
|
||||
|
||||
static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_directory_entries_count",
|
||||
"Sum of the entries in pageserver-stored directory listings",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
@@ -1809,6 +1818,7 @@ pub(crate) struct TimelineMetrics {
|
||||
resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
pub persistent_bytes_written: IntCounter,
|
||||
pub evictions: IntCounter,
|
||||
@@ -1818,12 +1828,12 @@ pub(crate) struct TimelineMetrics {
|
||||
impl TimelineMetrics {
|
||||
pub fn new(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
timeline_id_raw: &TimelineId,
|
||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||
) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let timeline_id = timeline_id_raw.to_string();
|
||||
let flush_time_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::LayerFlush,
|
||||
&tenant_id,
|
||||
@@ -1876,6 +1886,22 @@ impl TimelineMetrics {
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
|
||||
let directory_entries_count_gauge_closure = {
|
||||
let tenant_shard_id = *tenant_shard_id;
|
||||
let timeline_id_raw = *timeline_id_raw;
|
||||
move || {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id_raw.to_string();
|
||||
let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
gauge
|
||||
}
|
||||
};
|
||||
let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
|
||||
Lazy::new(Box::new(directory_entries_count_gauge_closure));
|
||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -1902,6 +1928,7 @@ impl TimelineMetrics {
|
||||
last_record_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
num_persistent_files_created,
|
||||
persistent_bytes_written,
|
||||
evictions,
|
||||
@@ -1944,6 +1971,9 @@ impl Drop for TimelineMetrics {
|
||||
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
}
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
}
|
||||
let _ =
|
||||
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
@@ -2400,6 +2430,72 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
}
|
||||
|
||||
pub mod tokio_epoll_uring {
|
||||
use metrics::UIntGauge;
|
||||
|
||||
pub struct Collector {
|
||||
descs: Vec<metrics::core::Desc>,
|
||||
systems_created: UIntGauge,
|
||||
systems_destroyed: UIntGauge,
|
||||
}
|
||||
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
impl metrics::core::Collector for Collector {
|
||||
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
||||
self.descs.iter().collect()
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||
let mut mfs = Vec::with_capacity(NMETRICS);
|
||||
let tokio_epoll_uring::metrics::Metrics {
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
} = tokio_epoll_uring::metrics::global();
|
||||
self.systems_created.set(systems_created);
|
||||
mfs.extend(self.systems_created.collect());
|
||||
self.systems_destroyed.set(systems_destroyed);
|
||||
mfs.extend(self.systems_destroyed.collect());
|
||||
mfs
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector {
|
||||
#[allow(clippy::new_without_default)]
|
||||
pub fn new() -> Self {
|
||||
let mut descs = Vec::new();
|
||||
|
||||
let systems_created = UIntGauge::new(
|
||||
"pageserver_tokio_epoll_uring_systems_created",
|
||||
"counter of tokio-epoll-uring systems that were created",
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(
|
||||
metrics::core::Collector::desc(&systems_created)
|
||||
.into_iter()
|
||||
.cloned(),
|
||||
);
|
||||
|
||||
let systems_destroyed = UIntGauge::new(
|
||||
"pageserver_tokio_epoll_uring_systems_destroyed",
|
||||
"counter of tokio-epoll-uring systems that were destroyed",
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(
|
||||
metrics::core::Collector::desc(&systems_destroyed)
|
||||
.into_iter()
|
||||
.cloned(),
|
||||
);
|
||||
|
||||
Self {
|
||||
descs,
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
|
||||
@@ -91,8 +91,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
|
||||
/// and check that there is no more data after the EOF marker.
|
||||
///
|
||||
/// XXX: Currently, any trailing data after the EOF marker prints a warning.
|
||||
/// Perhaps it should be a hard error?
|
||||
/// 'tar' command can also write extra blocks of zeros, up to a record
|
||||
/// size, controlled by the --record-size argument. Ignore them too.
|
||||
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
|
||||
use tokio::io::AsyncReadExt;
|
||||
let mut buf = [0u8; 512];
|
||||
@@ -113,17 +113,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
||||
anyhow::bail!("invalid tar EOF marker");
|
||||
}
|
||||
|
||||
// Drain any data after the EOF marker
|
||||
// Drain any extra zero-blocks after the EOF marker
|
||||
let mut trailing_bytes = 0;
|
||||
let mut seen_nonzero_bytes = false;
|
||||
loop {
|
||||
let nbytes = reader.read(&mut buf).await?;
|
||||
trailing_bytes += nbytes;
|
||||
if !buf.iter().all(|&x| x == 0) {
|
||||
seen_nonzero_bytes = true;
|
||||
}
|
||||
if nbytes == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if trailing_bytes > 0 {
|
||||
warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
|
||||
if seen_nonzero_bytes {
|
||||
anyhow::bail!("unexpected non-zero bytes after the tar archive");
|
||||
}
|
||||
if trailing_bytes % 512 != 0 {
|
||||
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -155,6 +156,8 @@ impl Timeline {
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_aux_files: None,
|
||||
pending_directory_entries: Vec::new(),
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -868,6 +871,15 @@ pub struct DatadirModification<'a> {
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
// If we already wrote any aux file changes in this modification, stash the latest dir. If set,
|
||||
// [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
|
||||
// if AUX_FILES_KEY is already set.
|
||||
pending_aux_files: Option<AuxFilesDirectory>,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
@@ -899,6 +911,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = DbDirectory::ser(&DbDirectory {
|
||||
dbdirs: HashMap::new(),
|
||||
})?;
|
||||
self.pending_directory_entries.push((DirectoryKind::Db, 0));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory
|
||||
@@ -907,16 +920,24 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||
xids: HashSet::new(),
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::TwoPhase, 0));
|
||||
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
|
||||
let empty_dir = Value::Image(buf);
|
||||
self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
|
||||
self.put(
|
||||
slru_dir_to_key(SlruKind::MultiXactMembers),
|
||||
empty_dir.clone(),
|
||||
);
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
|
||||
self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1017,6 +1038,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = RelDirectory::ser(&RelDirectory {
|
||||
rels: HashSet::new(),
|
||||
})?;
|
||||
self.pending_directory_entries.push((DirectoryKind::Rel, 0));
|
||||
self.put(
|
||||
rel_dir_to_key(spcnode, dbnode),
|
||||
Value::Image(Bytes::from(buf)),
|
||||
@@ -1039,6 +1061,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::TwoPhase, dir.xids.len()));
|
||||
self.put(
|
||||
TWOPHASEDIR_KEY,
|
||||
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
||||
@@ -1074,6 +1098,8 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut dir = DbDirectory::des(&buf)?;
|
||||
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
||||
let buf = DbDirectory::ser(&dir)?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Db, dir.dbdirs.len()));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
} else {
|
||||
warn!(
|
||||
@@ -1111,6 +1137,8 @@ impl<'a> DatadirModification<'a> {
|
||||
// Didn't exist. Update dbdir
|
||||
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// and create the RelDirectory
|
||||
@@ -1125,6 +1153,10 @@ impl<'a> DatadirModification<'a> {
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, rel_dir.rels.len()));
|
||||
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(
|
||||
@@ -1216,6 +1248,9 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
let mut dir = RelDirectory::des(&buf)?;
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, dir.rels.len()));
|
||||
|
||||
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
||||
} else {
|
||||
@@ -1251,6 +1286,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.segments.insert(segno) {
|
||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
|
||||
self.put(
|
||||
dir_key,
|
||||
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
||||
@@ -1295,6 +1332,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.segments.remove(&segno) {
|
||||
warn!("slru segment {:?}/{} does not exist", kind, segno);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
|
||||
self.put(
|
||||
dir_key,
|
||||
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
||||
@@ -1325,6 +1364,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.xids.remove(&xid) {
|
||||
warn!("twophase file for xid {} does not exist", xid);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::TwoPhase, dir.xids.len()));
|
||||
self.put(
|
||||
TWOPHASEDIR_KEY,
|
||||
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
||||
@@ -1340,6 +1381,8 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, 0));
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1350,28 +1393,76 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(buf) => AuxFilesDirectory::des(&buf)?,
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
let dir = if let Some(mut dir) = self.pending_aux_files.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value
|
||||
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(dir_bytes) => {
|
||||
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
|
||||
// Key is already set, we may append a delta
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
|
||||
// we are assuming that all _other_ possible errors represents a missing key. If some
|
||||
// other error occurs, we may incorrectly reset the map of aux files.
|
||||
Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
dir.upsert(file_path, content);
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
dir
|
||||
}
|
||||
}
|
||||
};
|
||||
let path = path.to_string();
|
||||
if content.is_empty() {
|
||||
dir.files.remove(&path);
|
||||
} else {
|
||||
dir.files.insert(path, Bytes::copy_from_slice(content));
|
||||
}
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, dir.files.len()));
|
||||
self.pending_aux_files = Some(dir);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1427,6 +1518,10 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_nblocks = 0;
|
||||
}
|
||||
|
||||
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1464,6 +1559,10 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
}
|
||||
|
||||
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1573,8 +1672,18 @@ struct RelDirectory {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct AuxFilesDirectory {
|
||||
files: HashMap<String, Bytes>,
|
||||
pub(crate) struct AuxFilesDirectory {
|
||||
pub(crate) files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
impl AuxFilesDirectory {
|
||||
pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
|
||||
if let Some(value) = value {
|
||||
self.files.insert(key, value);
|
||||
} else {
|
||||
self.files.remove(&key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -1588,13 +1697,82 @@ struct SlruSegmentDirectory {
|
||||
segments: HashSet<u32>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
||||
#[repr(u8)]
|
||||
pub(crate) enum DirectoryKind {
|
||||
Db,
|
||||
TwoPhase,
|
||||
Rel,
|
||||
AuxFiles,
|
||||
SlruSegment(SlruKind),
|
||||
}
|
||||
|
||||
impl DirectoryKind {
|
||||
pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
|
||||
pub(crate) fn offset(&self) -> usize {
|
||||
self.into_usize()
|
||||
}
|
||||
}
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
//use super::repo_harness::*;
|
||||
//use super::*;
|
||||
use hex_literal::hex;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
|
||||
/// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
|
||||
#[tokio::test]
|
||||
async fn aux_files_round_trip() -> anyhow::Result<()> {
|
||||
let name = "aux_files_round_trip";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
// First modification: insert two keys
|
||||
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x1008))?;
|
||||
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
let expect_1008 = HashMap::from([
|
||||
("foo/bar1".to_string(), Bytes::from_static(b"content1")),
|
||||
("foo/bar2".to_string(), Bytes::from_static(b"content2")),
|
||||
]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
// Second modification: update one key, remove the other
|
||||
let mut modification = tline.begin_modification(Lsn(0x2000));
|
||||
modification.put_file("foo/bar1", b"content3", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x2008))?;
|
||||
modification.put_file("foo/bar2", b"", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
let expect_2008 =
|
||||
HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
|
||||
assert_eq!(readback, expect_2008);
|
||||
|
||||
// Reading back in time works
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*
|
||||
fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
|
||||
|
||||
@@ -30,10 +30,6 @@
|
||||
//! only a single tenant or timeline.
|
||||
//!
|
||||
|
||||
// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
|
||||
// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
|
||||
#![allow(clippy::declare_interior_mutable_const)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
@@ -312,7 +308,6 @@ struct MutableTaskState {
|
||||
}
|
||||
|
||||
struct PageServerTask {
|
||||
#[allow(dead_code)] // unused currently
|
||||
task_id: PageserverTaskId,
|
||||
|
||||
kind: TaskKind,
|
||||
@@ -576,8 +571,8 @@ pub fn shutdown_token() -> CancellationToken {
|
||||
|
||||
/// Has the current task been requested to shut down?
|
||||
pub fn is_shutdown_requested() -> bool {
|
||||
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
|
||||
cancel.is_cancelled()
|
||||
if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) {
|
||||
true_or_false
|
||||
} else {
|
||||
if !cfg!(test) {
|
||||
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
@@ -25,6 +25,7 @@ use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::fmt;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
@@ -53,6 +54,7 @@ use self::metadata::TimelineMetadata;
|
||||
use self::mgr::GetActiveTenantError;
|
||||
use self::mgr::GetTenantError;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::upload::upload_index_part;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineExclusionError;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
@@ -643,10 +645,10 @@ impl Tenant {
|
||||
|
||||
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
||||
// we shut down while attaching.
|
||||
let Ok(attach_gate_guard) = tenant.gate.enter() else {
|
||||
// We just created the Tenant: nothing else can have shut it down yet
|
||||
unreachable!();
|
||||
};
|
||||
let attach_gate_guard = tenant
|
||||
.gate
|
||||
.enter()
|
||||
.expect("We just created the Tenant: nothing else can have shut it down yet");
|
||||
|
||||
// Do all the hard work in the background
|
||||
let tenant_clone = Arc::clone(&tenant);
|
||||
@@ -754,36 +756,27 @@ impl Tenant {
|
||||
AttachType::Normal
|
||||
};
|
||||
|
||||
let preload_timer = TENANT.preload.start_timer();
|
||||
let preload = match mode {
|
||||
SpawnMode::Create => {
|
||||
// Don't count the skipped preload into the histogram of preload durations
|
||||
preload_timer.stop_and_discard();
|
||||
let preload = match (&mode, &remote_storage) {
|
||||
(SpawnMode::Create, _) => {
|
||||
None
|
||||
},
|
||||
SpawnMode::Normal => {
|
||||
match &remote_storage {
|
||||
Some(remote_storage) => Some(
|
||||
match tenant_clone
|
||||
.preload(remote_storage, task_mgr::shutdown_token())
|
||||
.instrument(
|
||||
tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
|
||||
)
|
||||
.await {
|
||||
Ok(p) => {
|
||||
preload_timer.observe_duration();
|
||||
p
|
||||
}
|
||||
,
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
}
|
||||
},
|
||||
),
|
||||
None => None,
|
||||
(SpawnMode::Normal, Some(remote_storage)) => {
|
||||
let _preload_timer = TENANT.preload.start_timer();
|
||||
let res = tenant_clone
|
||||
.preload(remote_storage, task_mgr::shutdown_token())
|
||||
.await;
|
||||
match res {
|
||||
Ok(p) => Some(p),
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
(SpawnMode::Normal, None) => {
|
||||
let _preload_timer = TENANT.preload.start_timer();
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// Remote preload is complete.
|
||||
@@ -819,36 +812,37 @@ impl Tenant {
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
match DeleteTenantFlow::resume_from_attach(
|
||||
let deleted = DeleteTenantFlow::resume_from_attach(
|
||||
deletion,
|
||||
&tenant_clone,
|
||||
preload,
|
||||
tenants,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => return Ok(()),
|
||||
.await;
|
||||
|
||||
if let Err(e) = deleted {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
|
||||
let attach_timer = match mode {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
let attached = {
|
||||
let _attach_timer = match mode {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
};
|
||||
tenant_clone.attach(preload, mode, &ctx).await
|
||||
};
|
||||
match tenant_clone.attach(preload, mode, &ctx).await {
|
||||
|
||||
match attached {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
@@ -861,34 +855,26 @@ impl Tenant {
|
||||
// logical size calculations: if logical size calculation semaphore is saturated,
|
||||
// then warmup will wait for that before proceeding to the next tenant.
|
||||
if let AttachType::Warmup(_permit) = attach_type {
|
||||
let mut futs = FuturesUnordered::new();
|
||||
let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
|
||||
for t in timelines {
|
||||
futs.push(t.await_initial_logical_size())
|
||||
}
|
||||
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
|
||||
tracing::info!("Waiting for initial logical sizes while warming up...");
|
||||
while futs.next().await.is_some() {
|
||||
|
||||
}
|
||||
while futs.next().await.is_some() {}
|
||||
tracing::info!("Warm-up complete");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
|
||||
);
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn preload(
|
||||
self: &Arc<Tenant>,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<TenantPreload> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
// Get list of remote timelines
|
||||
// download index files for every tenant timeline
|
||||
info!("listing remote timelines");
|
||||
@@ -1376,7 +1362,7 @@ impl Tenant {
|
||||
async move {
|
||||
debug!("starting index part download");
|
||||
|
||||
let index_part = client.download_index_file(cancel_clone).await;
|
||||
let index_part = client.download_index_file(&cancel_clone).await;
|
||||
|
||||
debug!("finished index part download");
|
||||
|
||||
@@ -2397,6 +2383,67 @@ impl Tenant {
|
||||
pub(crate) fn get_generation(&self) -> Generation {
|
||||
self.generation
|
||||
}
|
||||
|
||||
/// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
|
||||
/// and can leave the tenant in a bad state if it fails. The caller is responsible for
|
||||
/// resetting this tenant to a valid state if we fail.
|
||||
pub(crate) async fn split_prepare(
|
||||
&self,
|
||||
child_shards: &Vec<TenantShardId>,
|
||||
) -> anyhow::Result<()> {
|
||||
let timelines = self.timelines.lock().unwrap().clone();
|
||||
for timeline in timelines.values() {
|
||||
let Some(tl_client) = &timeline.remote_client else {
|
||||
anyhow::bail!("Remote storage is mandatory");
|
||||
};
|
||||
|
||||
let Some(remote_storage) = &self.remote_storage else {
|
||||
anyhow::bail!("Remote storage is mandatory");
|
||||
};
|
||||
|
||||
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
|
||||
// to ensure that they do not start a split if currently in the process of doing these.
|
||||
|
||||
// Upload an index from the parent: this is partly to provide freshness for the
|
||||
// child tenants that will copy it, and partly for general ease-of-debugging: there will
|
||||
// always be a parent shard index in the same generation as we wrote the child shard index.
|
||||
tl_client.schedule_index_upload_for_file_changes()?;
|
||||
tl_client.wait_completion().await?;
|
||||
|
||||
// Shut down the timeline's remote client: this means that the indices we write
|
||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||
tl_client.shutdown().await?;
|
||||
|
||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
|
||||
// we use here really is the remotely persistent one).
|
||||
let result = tl_client
|
||||
.download_index_file(&self.cancel)
|
||||
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
|
||||
.await?;
|
||||
let index_part = match result {
|
||||
MaybeDeletedIndexPart::Deleted(_) => {
|
||||
anyhow::bail!("Timeline deletion happened concurrently with split")
|
||||
}
|
||||
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||
};
|
||||
|
||||
for child_shard in child_shards {
|
||||
upload_index_part(
|
||||
remote_storage,
|
||||
child_shard,
|
||||
&timeline.timeline_id,
|
||||
self.generation,
|
||||
&index_part,
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
|
||||
@@ -2834,7 +2881,7 @@ impl Tenant {
|
||||
let config_path = config_path.to_owned();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Handle::current().block_on(async move {
|
||||
let conf_content = conf_content.as_bytes();
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -2871,7 +2918,7 @@ impl Tenant {
|
||||
let target_config_path = target_config_path.to_owned();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Handle::current().block_on(async move {
|
||||
let conf_content = conf_content.as_bytes();
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -3229,7 +3276,7 @@ impl Tenant {
|
||||
|
||||
/// For unit tests, make this visible so that other modules can directly create timelines
|
||||
#[cfg(test)]
|
||||
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn bootstrap_timeline_test(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
@@ -3293,7 +3340,7 @@ impl Tenant {
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
@@ -3343,8 +3390,10 @@ impl Tenant {
|
||||
);
|
||||
let dest_path =
|
||||
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
|
||||
|
||||
// if this fails, it will get retried by retried control plane requests
|
||||
storage
|
||||
.copy_object(source_path, dest_path)
|
||||
.copy_object(source_path, dest_path, &self.cancel)
|
||||
.await
|
||||
.context("copy initdb tar")?;
|
||||
}
|
||||
@@ -3732,6 +3781,10 @@ impl Tenant {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.read().unwrap().tenant_conf
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_timeline_and_uninit_mark(
|
||||
@@ -3864,6 +3917,7 @@ pub(crate) mod harness {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::walredo::apply_neon;
|
||||
use crate::{
|
||||
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
|
||||
};
|
||||
@@ -3879,8 +3933,7 @@ pub(crate) mod harness {
|
||||
TimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
pub fn test_img(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(64, 0);
|
||||
@@ -3916,6 +3969,8 @@ pub(crate) mod harness {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Debug)]
|
||||
enum LoadMode {
|
||||
Local,
|
||||
Remote,
|
||||
@@ -3978,6 +4033,7 @@ pub(crate) mod harness {
|
||||
std::fs::create_dir_all(&remote_fs_dir).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
|
||||
@@ -3998,7 +4054,7 @@ pub(crate) mod harness {
|
||||
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
|
||||
}
|
||||
|
||||
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
(
|
||||
self.try_load(&ctx)
|
||||
@@ -4008,6 +4064,74 @@ pub(crate) mod harness {
|
||||
)
|
||||
}
|
||||
|
||||
/// For tests that specifically want to exercise the local load path, which does
|
||||
/// not use remote storage.
|
||||
pub(crate) async fn try_load_local(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
self.do_try_load(ctx, LoadMode::Local).await
|
||||
}
|
||||
|
||||
/// The 'load' in this function is either a local load or a normal attachment,
|
||||
pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
// If we have nothing in remote storage, must use load_local instead of attach: attach
|
||||
// will error out if there are no timelines.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
|
||||
// this weird state of a Tenant which exists but doesn't have any timelines.
|
||||
let mode = match self.remote_empty() {
|
||||
true => LoadMode::Local,
|
||||
false => LoadMode::Remote,
|
||||
};
|
||||
|
||||
self.do_try_load(ctx, mode).await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))]
|
||||
async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
mode: LoadMode,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
TenantState::Loading,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
self.generation,
|
||||
&ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
// This is a legacy/test code path: sharding isn't supported here.
|
||||
ShardIdentity::unsharded(),
|
||||
Some(walredo_mgr),
|
||||
self.tenant_shard_id,
|
||||
Some(self.remote_storage.clone()),
|
||||
self.deletion_queue.new_client(),
|
||||
));
|
||||
|
||||
match mode {
|
||||
LoadMode::Local => {
|
||||
tenant.load_local(ctx).await?;
|
||||
}
|
||||
LoadMode::Remote => {
|
||||
let preload = tenant
|
||||
.preload(&self.remote_storage, CancellationToken::new())
|
||||
.await?;
|
||||
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
|
||||
}
|
||||
}
|
||||
|
||||
tenant.state.send_replace(TenantState::Active);
|
||||
for timeline in tenant.timelines.lock().unwrap().values() {
|
||||
timeline.set_state(TimelineState::Active);
|
||||
}
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
fn remote_empty(&self) -> bool {
|
||||
let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
|
||||
let remote_tenant_dir = self
|
||||
@@ -4033,77 +4157,6 @@ pub(crate) mod harness {
|
||||
}
|
||||
}
|
||||
|
||||
async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
mode: LoadMode,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
TenantState::Loading,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
self.generation,
|
||||
&ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
// This is a legacy/test code path: sharding isn't supported here.
|
||||
ShardIdentity::unsharded(),
|
||||
Some(walredo_mgr),
|
||||
self.tenant_shard_id,
|
||||
Some(self.remote_storage.clone()),
|
||||
self.deletion_queue.new_client(),
|
||||
));
|
||||
|
||||
match mode {
|
||||
LoadMode::Local => {
|
||||
tenant
|
||||
.load_local(ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
LoadMode::Remote => {
|
||||
let preload = tenant
|
||||
.preload(&self.remote_storage, CancellationToken::new())
|
||||
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
tenant
|
||||
.attach(Some(preload), SpawnMode::Normal, ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
tenant.state.send_replace(TenantState::Active);
|
||||
for timeline in tenant.timelines.lock().unwrap().values() {
|
||||
timeline.set_state(TimelineState::Active);
|
||||
}
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
/// For tests that specifically want to exercise the local load path, which does
|
||||
/// not use remote storage.
|
||||
pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
self.do_try_load(ctx, LoadMode::Local).await
|
||||
}
|
||||
|
||||
/// The 'load' in this function is either a local load or a normal attachment,
|
||||
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
// If we have nothing in remote storage, must use load_local instead of attach: attach
|
||||
// will error out if there are no timelines.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
|
||||
// this weird state of a Tenant which exists but doesn't have any timelines.
|
||||
let mode = match self.remote_empty() {
|
||||
true => LoadMode::Local,
|
||||
false => LoadMode::Remote,
|
||||
};
|
||||
|
||||
self.do_try_load(ctx, mode).await
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||
self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
|
||||
}
|
||||
@@ -4124,20 +4177,33 @@ pub(crate) mod harness {
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
key,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{s}");
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
// For Neon wal records, we can decode without spawning postgres, so do so.
|
||||
let base_img = base_img.expect("Neon WAL redo requires base image").1;
|
||||
let mut page = BytesMut::new();
|
||||
page.extend_from_slice(&base_img);
|
||||
for (_record_lsn, record) in records {
|
||||
apply_neon::apply_in_neon(&record, key, &mut page)?;
|
||||
}
|
||||
Ok(page.freeze())
|
||||
} else {
|
||||
// We never spawn a postgres walredo process in unit tests: just log what we might have done.
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
key,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{s}");
|
||||
|
||||
Ok(TEST_IMG(&s))
|
||||
Ok(test_img(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4172,7 +4238,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x10),
|
||||
&Value::Image(TEST_IMG("foo at 0x10")),
|
||||
&Value::Image(test_img("foo at 0x10")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4184,7 +4250,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x20),
|
||||
&Value::Image(TEST_IMG("foo at 0x20")),
|
||||
&Value::Image(test_img("foo at 0x20")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4193,15 +4259,15 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
|
||||
TEST_IMG("foo at 0x20")
|
||||
test_img("foo at 0x20")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -4310,7 +4376,6 @@ mod tests {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
let writer = tline.writer().await;
|
||||
// Create a relation on the timeline
|
||||
@@ -4318,7 +4383,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4328,7 +4393,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4342,7 +4407,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4352,7 +4417,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4507,7 +4572,7 @@ mod tests {
|
||||
// Broken, as long as you don't need to access data from the parent.
|
||||
assert_eq!(
|
||||
newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
|
||||
TEST_IMG(&format!("foo at {}", Lsn(0x70)))
|
||||
test_img(&format!("foo at {}", Lsn(0x70)))
|
||||
);
|
||||
|
||||
// This needs to traverse to the parent, and fails.
|
||||
@@ -4584,7 +4649,7 @@ mod tests {
|
||||
// Check that the data is still accessible on the branch.
|
||||
assert_eq!(
|
||||
newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
|
||||
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
|
||||
test_img(&format!("foo at {}", Lsn(0x40)))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -4759,7 +4824,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x10),
|
||||
&Value::Image(TEST_IMG("foo at 0x10")),
|
||||
&Value::Image(test_img("foo at 0x10")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4776,7 +4841,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x20),
|
||||
&Value::Image(TEST_IMG("foo at 0x20")),
|
||||
&Value::Image(test_img("foo at 0x20")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4793,7 +4858,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x30),
|
||||
&Value::Image(TEST_IMG("foo at 0x30")),
|
||||
&Value::Image(test_img("foo at 0x30")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4810,7 +4875,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x40),
|
||||
&Value::Image(TEST_IMG("foo at 0x40")),
|
||||
&Value::Image(test_img("foo at 0x40")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4824,23 +4889,23 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
|
||||
TEST_IMG("foo at 0x20")
|
||||
test_img("foo at 0x20")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
|
||||
TEST_IMG("foo at 0x30")
|
||||
test_img("foo at 0x30")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
|
||||
TEST_IMG("foo at 0x40")
|
||||
test_img("foo at 0x40")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -4867,7 +4932,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5038,7 +5103,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5059,7 +5124,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5073,7 +5138,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5127,7 +5192,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5156,7 +5221,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5171,7 +5236,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5233,7 +5298,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5255,7 +5320,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, *lsn, &ctx).await?,
|
||||
TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
|
||||
test_img(&format!("{idx} {blknum} at {lsn}"))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
//! len < 128: 0XXXXXXX
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use tokio_epoll_uring::{BoundedBuf, Slice};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
@@ -100,6 +103,8 @@ pub struct BlobWriter<const BUFFERED: bool> {
|
||||
offset: u64,
|
||||
/// A buffer to save on write calls, only used if BUFFERED=true
|
||||
buf: Vec<u8>,
|
||||
/// We do tiny writes for the length headers; they need to be in an owned buffer;
|
||||
io_buf: Option<BytesMut>,
|
||||
}
|
||||
|
||||
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
@@ -108,6 +113,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
inner,
|
||||
offset: start_offset,
|
||||
buf: Vec::with_capacity(Self::CAPACITY),
|
||||
io_buf: Some(BytesMut::new()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,21 +123,31 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
|
||||
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
|
||||
|
||||
#[inline(always)]
|
||||
/// Writes the given buffer directly to the underlying `VirtualFile`.
|
||||
/// You need to make sure that the internal buffer is empty, otherwise
|
||||
/// data will be written in wrong order.
|
||||
async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
|
||||
self.inner.write_all(src_buf).await?;
|
||||
self.offset += src_buf.len() as u64;
|
||||
Ok(())
|
||||
#[inline(always)]
|
||||
async fn write_all_unbuffered<B: BoundedBuf>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let (src_buf, res) = self.inner.write_all(src_buf).await;
|
||||
let nbytes = match res {
|
||||
Ok(nbytes) => nbytes,
|
||||
Err(e) => return (src_buf, Err(e)),
|
||||
};
|
||||
self.offset += nbytes as u64;
|
||||
(src_buf, Ok(()))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
||||
self.inner.write_all(&self.buf).await?;
|
||||
self.buf.clear();
|
||||
let buf = std::mem::take(&mut self.buf);
|
||||
let (mut buf, res) = self.inner.write_all(buf).await;
|
||||
res?;
|
||||
buf.clear();
|
||||
self.buf = buf;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -146,62 +162,91 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
}
|
||||
|
||||
/// Internal, possibly buffered, write function
|
||||
async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
|
||||
async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
self.write_all_unbuffered(src_buf).await?;
|
||||
return Ok(());
|
||||
return self.write_all_unbuffered(src_buf).await;
|
||||
}
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
let src_buf_len = src_buf.bytes_init();
|
||||
if src_buf_len == 0 {
|
||||
return (Slice::into_inner(src_buf.slice_full()), Ok(()));
|
||||
}
|
||||
let mut src_buf = src_buf.slice(0..src_buf_len);
|
||||
// First try to copy as much as we can into the buffer
|
||||
if remaining > 0 {
|
||||
let copied = self.write_into_buffer(src_buf);
|
||||
src_buf = &src_buf[copied..];
|
||||
let copied = self.write_into_buffer(&src_buf);
|
||||
src_buf = src_buf.slice(copied..);
|
||||
}
|
||||
// Then, if the buffer is full, flush it out
|
||||
if self.buf.len() == Self::CAPACITY {
|
||||
self.flush_buffer().await?;
|
||||
if let Err(e) = self.flush_buffer().await {
|
||||
return (Slice::into_inner(src_buf), Err(e));
|
||||
}
|
||||
}
|
||||
// Finally, write the tail of src_buf:
|
||||
// If it wholly fits into the buffer without
|
||||
// completely filling it, then put it there.
|
||||
// If not, write it out directly.
|
||||
if !src_buf.is_empty() {
|
||||
let src_buf = if !src_buf.is_empty() {
|
||||
assert_eq!(self.buf.len(), 0);
|
||||
if src_buf.len() < Self::CAPACITY {
|
||||
let copied = self.write_into_buffer(src_buf);
|
||||
let copied = self.write_into_buffer(&src_buf);
|
||||
// We just verified above that src_buf fits into our internal buffer.
|
||||
assert_eq!(copied, src_buf.len());
|
||||
Slice::into_inner(src_buf)
|
||||
} else {
|
||||
self.write_all_unbuffered(src_buf).await?;
|
||||
let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
|
||||
if let Err(e) = res {
|
||||
return (src_buf, Err(e));
|
||||
}
|
||||
src_buf
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
Slice::into_inner(src_buf)
|
||||
};
|
||||
(src_buf, Ok(()))
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
|
||||
let offset = self.offset;
|
||||
|
||||
if srcbuf.len() < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
let len_buf = srcbuf.len() as u8;
|
||||
self.write_all(&[len_buf]).await?;
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if srcbuf.len() > 0x7fff_ffff {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({} bytes)", srcbuf.len()),
|
||||
));
|
||||
let len = srcbuf.bytes_init();
|
||||
|
||||
let mut io_buf = self.io_buf.take().expect("we always put it back below");
|
||||
io_buf.clear();
|
||||
let (io_buf, hdr_res) = async {
|
||||
if len < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
io_buf.put_u8(len as u8);
|
||||
self.write_all(io_buf).await
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if len > 0x7fff_ffff {
|
||||
return (
|
||||
io_buf,
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({} bytes)", len),
|
||||
)),
|
||||
);
|
||||
}
|
||||
let mut len_buf = (len as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
self.write_all(io_buf).await
|
||||
}
|
||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
self.write_all(&len_buf).await?;
|
||||
}
|
||||
self.write_all(srcbuf).await?;
|
||||
Ok(offset)
|
||||
.await;
|
||||
self.io_buf = Some(io_buf);
|
||||
match hdr_res {
|
||||
Ok(_) => (),
|
||||
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
|
||||
}
|
||||
let (srcbuf, res) = self.write_all(srcbuf).await;
|
||||
(srcbuf, res.map(|_| offset))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,12 +293,14 @@ mod tests {
|
||||
let file = VirtualFile::create(pathbuf.as_path()).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let offs = wtr.write_blob(blob).await?;
|
||||
let (_, res) = wtr.write_blob(blob.clone()).await;
|
||||
let offs = res?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
// Write out one page worth of zeros so that we can
|
||||
// read again with read_blk
|
||||
let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
|
||||
let offs = res?;
|
||||
println!("Writing final blob at offs={offs}");
|
||||
wtr.flush_buffer().await?;
|
||||
}
|
||||
|
||||
@@ -3,10 +3,10 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{models::TenantState, shard::TenantShardId};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, Instrument, Span};
|
||||
use tracing::{error, instrument, Instrument};
|
||||
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
@@ -84,17 +84,17 @@ async fn create_remote_delete_mark(
|
||||
let data = bytes::Bytes::from_static(data);
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data)));
|
||||
remote_storage
|
||||
.upload(stream, 0, &remote_mark_path, None)
|
||||
.upload(stream, 0, &remote_mark_path, None, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("mark_upload")?;
|
||||
|
||||
@@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark(
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path).await },
|
||||
|_e| false,
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
|
||||
@@ -496,11 +496,7 @@ impl DeleteTenantFlow {
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
.instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ use crate::{
|
||||
pub const VALUE_SZ: usize = 5;
|
||||
pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub const PAGE_SZ: usize = 8192;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
||||
@@ -6,6 +6,7 @@ use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::cmp::min;
|
||||
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
|
||||
/// An ephemeral file is append-only.
|
||||
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
|
||||
/// The other pages, which can no longer be modified, are accessed through the page cache.
|
||||
mutable_tail: [u8; PAGE_SZ],
|
||||
///
|
||||
/// None <=> IO is ongoing.
|
||||
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
|
||||
mutable_tail: Option<BytesMut>,
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -60,7 +64,7 @@ impl EphemeralFile {
|
||||
_timeline_id: timeline_id,
|
||||
file,
|
||||
len: 0,
|
||||
mutable_tail: [0u8; PAGE_SZ],
|
||||
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -103,7 +107,13 @@ impl EphemeralFile {
|
||||
};
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
Ok(BlockLease::EphemeralFileMutableTail(
|
||||
self.mutable_tail
|
||||
.as_deref()
|
||||
.expect("we're not doing IO, it must be Some()")
|
||||
.try_into()
|
||||
.expect("we ensure that it's always PAGE_SZ"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,21 +145,27 @@ impl EphemeralFile {
|
||||
) -> Result<(), io::Error> {
|
||||
let mut src_remaining = src;
|
||||
while !src_remaining.is_empty() {
|
||||
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
|
||||
let dst_remaining = &mut self
|
||||
.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not yet ongoing")[self.off..];
|
||||
let n = min(dst_remaining.len(), src_remaining.len());
|
||||
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
|
||||
self.off += n;
|
||||
src_remaining = &src_remaining[n..];
|
||||
if self.off == PAGE_SZ {
|
||||
match self
|
||||
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
|
||||
.expect("IO is not yet ongoing");
|
||||
let (mutable_tail, res) = self
|
||||
.ephemeral_file
|
||||
.file
|
||||
.write_all_at(
|
||||
&self.ephemeral_file.mutable_tail,
|
||||
self.blknum as u64 * PAGE_SZ as u64,
|
||||
)
|
||||
.await
|
||||
{
|
||||
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
|
||||
.await;
|
||||
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
|
||||
// I.e., the IO isn't retryable if we panic.
|
||||
self.ephemeral_file.mutable_tail = Some(mutable_tail);
|
||||
match res {
|
||||
Ok(_) => {
|
||||
// Pre-warm the page cache with what we just wrote.
|
||||
// This isn't necessary for coherency/correctness, but it's how we've always done it.
|
||||
@@ -169,7 +185,12 @@ impl EphemeralFile {
|
||||
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
|
||||
buf.copy_from_slice(
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref()
|
||||
.expect("IO is not ongoing"),
|
||||
);
|
||||
let _ = write_guard.mark_valid();
|
||||
// pre-warm successful
|
||||
}
|
||||
@@ -181,7 +202,11 @@ impl EphemeralFile {
|
||||
// Zero the buffer for re-use.
|
||||
// Zeroing is critical for correcntess because the write_blob code below
|
||||
// and similarly read_blk expect zeroed pages.
|
||||
self.ephemeral_file.mutable_tail.fill(0);
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not ongoing")
|
||||
.fill(0);
|
||||
// This block is done, move to next one.
|
||||
self.blknum += 1;
|
||||
self.off = 0;
|
||||
|
||||
@@ -279,7 +279,7 @@ pub async fn save_metadata(
|
||||
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
|
||||
.await
|
||||
.context("write metadata")?;
|
||||
Ok(())
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
@@ -22,7 +24,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
use utils::{completion, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -644,8 +646,6 @@ pub(crate) async fn shutdown_all_tenants() {
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
use utils::completion;
|
||||
|
||||
let mut join_set = JoinSet::new();
|
||||
|
||||
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
|
||||
@@ -1200,7 +1200,7 @@ impl TenantManager {
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
drop_cache: bool,
|
||||
ctx: RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
let Some(old_slot) = slot_guard.get_old_value() else {
|
||||
@@ -1253,7 +1253,7 @@ impl TenantManager {
|
||||
None,
|
||||
self.tenants,
|
||||
SpawnMode::Normal,
|
||||
&ctx,
|
||||
ctx,
|
||||
)?;
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
@@ -1375,6 +1375,309 @@ impl TenantManager {
|
||||
slot_guard.revert();
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
|
||||
if new_shard_count <= ShardCount(effective_old_shard_count) {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
|
||||
if !expansion_factor.is_power_of_two() {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
|
||||
let child_shards = tenant_shard_id.split(new_shard_count);
|
||||
tracing::info!(
|
||||
"Shard {} splits into: {}",
|
||||
tenant_shard_id.to_index(),
|
||||
child_shards
|
||||
.iter()
|
||||
.map(|id| format!("{}", id.to_index()))
|
||||
.join(",")
|
||||
);
|
||||
|
||||
// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
|
||||
if let Err(e) = tenant.split_prepare(&child_shards).await {
|
||||
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||
// have been left in a partially-shut-down state.
|
||||
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
|
||||
self.reset_tenant(tenant_shard_id, false, ctx).await?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
drop(tenant);
|
||||
let mut parent_slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
let parent = match parent_slot_guard.get_old_value() {
|
||||
Some(TenantSlot::Attached(t)) => t,
|
||||
Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
// tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
|
||||
// it would return an error.
|
||||
unreachable!()
|
||||
}
|
||||
None => {
|
||||
// We don't actually need the parent shard to still be attached to do our work, but it's
|
||||
// a weird enough situation that the caller probably didn't want us to continue working
|
||||
// if they had detached the tenant they requested the split on.
|
||||
anyhow::bail!("Detached parent shard in the middle of split!")
|
||||
}
|
||||
};
|
||||
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
let mut target_lsns = HashMap::new();
|
||||
for timeline in parent.timelines.lock().unwrap().clone().values() {
|
||||
target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
|
||||
}
|
||||
|
||||
// TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
|
||||
// and could slow down the children trying to catch up.
|
||||
|
||||
// Phase 3: Spawn the child shards
|
||||
for child_shard in &child_shards {
|
||||
let mut child_shard_identity = parent_shard_identity;
|
||||
child_shard_identity.count = child_shard.shard_count;
|
||||
child_shard_identity.number = child_shard.shard_number;
|
||||
|
||||
let child_location_conf = LocationConf {
|
||||
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: parent_generation,
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
shard: child_shard_identity,
|
||||
tenant_conf: parent_tenant_conf,
|
||||
};
|
||||
|
||||
self.upsert_location(
|
||||
*child_shard,
|
||||
child_location_conf,
|
||||
None,
|
||||
SpawnMode::Normal,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Phase 4: wait for child chards WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
let child_shard = {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
let peek_slot =
|
||||
tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
|
||||
peek_slot.and_then(|s| s.get_attached()).cloned()
|
||||
};
|
||||
if let Some(t) = child_shard {
|
||||
let timelines = t.timelines.lock().unwrap().clone();
|
||||
for timeline in timelines.values() {
|
||||
let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"Waiting for child shard {}/{} to reach target lsn {}...",
|
||||
child_shard_id,
|
||||
timeline.timeline_id,
|
||||
target_lsn
|
||||
);
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
tracing::warn!(
|
||||
"Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
"Child shard {}/{} reached target lsn {}",
|
||||
child_shard_id,
|
||||
timeline.timeline_id,
|
||||
target_lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 5: Shut down the parent shard, and erase it from disk
|
||||
let (_guard, progress) = completion::channel();
|
||||
match parent.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
Err(other) => {
|
||||
other.wait().await;
|
||||
}
|
||||
}
|
||||
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
None,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
drop(parent_slot_guard);
|
||||
|
||||
Ok(child_shards)
|
||||
}
|
||||
|
||||
/// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
|
||||
/// to avoid the children downloading them again.
|
||||
///
|
||||
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
|
||||
async fn shard_split_hardlink(
|
||||
&self,
|
||||
parent_shard: &Tenant,
|
||||
child_shards: Vec<TenantShardId>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
|
||||
let (parent_timelines, parent_layers) = {
|
||||
let mut parent_layers = Vec::new();
|
||||
let timelines = parent_shard.timelines.lock().unwrap().clone();
|
||||
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
|
||||
for timeline in timelines.values() {
|
||||
let timeline_layers = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.resident_layers()
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
.strip_prefix(&parent_path)
|
||||
.context("Removing prefix from parent layer path")?;
|
||||
parent_layers.push(relative_path.to_owned());
|
||||
}
|
||||
}
|
||||
debug_assert!(
|
||||
!parent_layers.is_empty(),
|
||||
"shutdown cannot empty the layermap"
|
||||
);
|
||||
(parent_timelines, parent_layers)
|
||||
};
|
||||
|
||||
let mut child_prefixes = Vec::new();
|
||||
let mut create_dirs = Vec::new();
|
||||
|
||||
for child in child_shards {
|
||||
let child_prefix = self.conf.tenant_path(&child);
|
||||
create_dirs.push(child_prefix.clone());
|
||||
create_dirs.extend(
|
||||
parent_timelines
|
||||
.iter()
|
||||
.map(|t| self.conf.timeline_path(&child, t)),
|
||||
);
|
||||
|
||||
child_prefixes.push(child_prefix);
|
||||
}
|
||||
|
||||
// Since we will do a large number of small filesystem metadata operations, batch them into
|
||||
// spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
|
||||
let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
|
||||
for dir in &create_dirs {
|
||||
if let Err(e) = std::fs::create_dir_all(dir) {
|
||||
// Ignore AlreadyExists errors, drop out on all other errors
|
||||
match e.kind() {
|
||||
std::io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for child_prefix in child_prefixes {
|
||||
for relative_layer in &parent_layers {
|
||||
let parent_path = parent_path.join(relative_layer);
|
||||
let child_path = child_prefix.join(relative_layer);
|
||||
if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
|
||||
match e.kind() {
|
||||
std::io::ErrorKind::AlreadyExists => {}
|
||||
std::io::ErrorKind::NotFound => {
|
||||
tracing::info!(
|
||||
"Layer {} not found during hard-linking, evicted during split?",
|
||||
relative_layer
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e).context(format!(
|
||||
"Hard linking {relative_layer} into {child_prefix}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Durability is not required for correctness, but if we crashed during split and
|
||||
// then came restarted with empty timeline dirs, it would be very inefficient to
|
||||
// re-populate from remote storage.
|
||||
for dir in create_dirs {
|
||||
if let Err(e) = crashsafe::fsync(&dir) {
|
||||
// Something removed a newly created timeline dir out from underneath us? Extremely
|
||||
// unexpected, but not worth panic'ing over as this whole function is just an
|
||||
// optimization.
|
||||
tracing::warn!("Failed to fsync directory {dir}: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(parent_layers.len())
|
||||
});
|
||||
|
||||
match jh.await {
|
||||
Ok(Ok(layer_count)) => {
|
||||
tracing::info!(count = layer_count, "Hard linked layers into child shards");
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
// This is an optimization, so we tolerate failure.
|
||||
tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
|
||||
}
|
||||
Err(e) => {
|
||||
// This is something totally unexpected like a panic, so bail out.
|
||||
anyhow::bail!("Error joining hard linking task: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -2209,8 +2512,6 @@ async fn remove_tenant_from_memory<V, F>(
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
use utils::completion;
|
||||
|
||||
let mut slot_guard =
|
||||
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
|
||||
@@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use std::ops::DerefMut;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -217,6 +215,7 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::download::download_retry;
|
||||
use crate::tenant::storage_layer::AsLayerDesc;
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
@@ -325,45 +324,6 @@ pub struct RemoteTimelineClient {
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
|
||||
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
|
||||
const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
|
||||
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
|
||||
///
|
||||
/// This is a convenience for the various upload functions. In future
|
||||
/// the anyhow::Error result should be replaced with a more structured type that
|
||||
/// enables callers to avoid handling shutdown as an error.
|
||||
async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<()>>,
|
||||
{
|
||||
match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(e)) => Err(e),
|
||||
Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
|
||||
Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
|
||||
}
|
||||
}
|
||||
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
|
||||
async fn download_cancellable<F, R>(
|
||||
cancel: &CancellationToken,
|
||||
future: F,
|
||||
) -> Result<R, DownloadError>
|
||||
where
|
||||
F: std::future::Future<Output = Result<R, DownloadError>>,
|
||||
{
|
||||
match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
|
||||
Ok(Ok(r)) => Ok(r),
|
||||
Ok(Err(e)) => Err(e),
|
||||
Err(TimeoutCancellableError::Timeout) => {
|
||||
Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
|
||||
}
|
||||
Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineClient {
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
@@ -506,7 +466,7 @@ impl RemoteTimelineClient {
|
||||
/// Download index file
|
||||
pub async fn download_index_file(
|
||||
&self,
|
||||
cancel: CancellationToken,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Index,
|
||||
@@ -1049,7 +1009,7 @@ impl RemoteTimelineClient {
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)?;
|
||||
|
||||
// all good, disarm the guard and mark as success
|
||||
@@ -1081,14 +1041,14 @@ impl RemoteTimelineClient {
|
||||
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"preserve_initdb_tar_zst",
|
||||
&cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancellled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("backing up initdb archive")?;
|
||||
Ok(())
|
||||
@@ -1147,22 +1107,17 @@ impl RemoteTimelineClient {
|
||||
|
||||
let cancel = shutdown_token();
|
||||
|
||||
let remaining = backoff::retry(
|
||||
let remaining = download_retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_files(Some(&timeline_storage_path))
|
||||
.list_files(Some(&timeline_storage_path), None, &cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"list_prefixes",
|
||||
"list remaining files",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled!"))
|
||||
.and_then(|x| x)
|
||||
.context("list prefixes")?;
|
||||
.context("list files remaining files")?;
|
||||
|
||||
// We will delete the current index_part object last, since it acts as a deletion
|
||||
// marker via its deleted_at attribute
|
||||
@@ -1351,6 +1306,7 @@ impl RemoteTimelineClient {
|
||||
/// queue.
|
||||
///
|
||||
async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
|
||||
let cancel = shutdown_token();
|
||||
// Loop to retry until it completes.
|
||||
loop {
|
||||
// If we're requested to shut down, close up shop and exit.
|
||||
@@ -1362,7 +1318,7 @@ impl RemoteTimelineClient {
|
||||
// the Future, but we're not 100% sure if the remote storage library
|
||||
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
||||
// upload finishes or times out soon enough.
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
if cancel.is_cancelled() {
|
||||
info!("upload task cancelled by shutdown request");
|
||||
match self.stop() {
|
||||
Ok(()) => {}
|
||||
@@ -1448,6 +1404,10 @@ impl RemoteTimelineClient {
|
||||
Ok(()) => {
|
||||
break;
|
||||
}
|
||||
Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// loop around to do the proper stopping
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
@@ -1473,7 +1433,7 @@ impl RemoteTimelineClient {
|
||||
retries,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&shutdown_token(),
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
@@ -1703,23 +1663,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_metadata(
|
||||
&self,
|
||||
layers: Vec<LayerFileName>,
|
||||
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
|
||||
let q = self.upload_queue.lock().unwrap();
|
||||
let q = match &*q {
|
||||
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", q.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(inner) => inner,
|
||||
};
|
||||
|
||||
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
|
||||
|
||||
Ok(decorated.collect())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
@@ -1990,7 +1933,7 @@ mod tests {
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let initial_index_part = match client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.download_index_file(&CancellationToken::new())
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
@@ -2084,7 +2027,7 @@ mod tests {
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let index_part = match client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.download_index_file(&CancellationToken::new())
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
@@ -2286,7 +2229,7 @@ mod tests {
|
||||
let client = test_state.build_client(get_generation);
|
||||
|
||||
let download_r = client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.download_index_file(&CancellationToken::new())
|
||||
.await
|
||||
.expect("download should always succeed");
|
||||
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
||||
|
||||
@@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::fs::{self, File, OpenOptions};
|
||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
@@ -83,15 +81,13 @@ pub async fn download_layer_file<'a>(
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
|
||||
// file: the write to local file doesn't start until after the request header is returned
|
||||
// and we start draining the body stream below
|
||||
let download = download_cancellable(cancel, storage.download(&remote_path))
|
||||
let download = storage
|
||||
.download(&remote_path, cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -100,43 +96,26 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
// Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
|
||||
// and we will unlink the temporary file if there is an error. This unlink is important because we
|
||||
// are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
|
||||
// we will imminiently try and write to again.
|
||||
let bytes_amount: u64 = match timeout_cancellable(
|
||||
DOWNLOAD_TIMEOUT,
|
||||
cancel,
|
||||
tokio::io::copy_buf(&mut reader, &mut destination_file),
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
Ok(b) => Ok(b),
|
||||
))
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
let destination_file = destination_file.into_inner();
|
||||
Ok((destination_file, bytes_amount))
|
||||
}
|
||||
Err(e) => {
|
||||
// Remove incomplete files: on restart Timeline would do this anyway, but we must
|
||||
// do it here for the retry case.
|
||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let destination_file = destination_file.into_inner();
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
@@ -216,16 +195,17 @@ pub async fn list_remote_timelines(
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
});
|
||||
|
||||
let cancel_inner = cancel.clone();
|
||||
let listing = download_retry_forever(
|
||||
|| {
|
||||
download_cancellable(
|
||||
&cancel_inner,
|
||||
storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|
||||
storage.list(
|
||||
Some(&remote_path),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
},
|
||||
&format!("list timelines for {tenant_shard_id}"),
|
||||
cancel,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -258,29 +238,25 @@ async fn do_download_index_part(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
index_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||
|
||||
let cancel_inner = cancel.clone();
|
||||
let index_part_bytes = download_retry_forever(
|
||||
|| async {
|
||||
// Cancellation: if is safe to cancel this future because we're just downloading into
|
||||
// a memory buffer, not touching local disk.
|
||||
let index_part_download =
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
let mut stream = std::pin::pin!(index_part_download.download_stream);
|
||||
while let Some(chunk) = stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
index_part_bytes.extend_from_slice(&chunk[..]);
|
||||
}
|
||||
Ok(index_part_bytes)
|
||||
let mut bytes = Vec::new();
|
||||
|
||||
let stream = download.download_stream;
|
||||
let mut stream = StreamReader::new(stream);
|
||||
|
||||
tokio::io::copy_buf(&mut stream, &mut bytes)
|
||||
.await
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
@@ -288,7 +264,7 @@ async fn do_download_index_part(
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| format!("download index part file at {remote_path:?}"))
|
||||
.with_context(|| format!("deserialize index part file at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(index_part)
|
||||
@@ -305,7 +281,7 @@ pub(super) async fn download_index_part(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -325,14 +301,8 @@ pub(super) async fn download_index_part(
|
||||
// index in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
cancel.clone(),
|
||||
)
|
||||
.await;
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
@@ -357,7 +327,7 @@ pub(super) async fn download_index_part(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
my_generation.previous(),
|
||||
cancel.clone(),
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
@@ -379,18 +349,13 @@ pub(super) async fn download_index_part(
|
||||
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
||||
// to constructing a full index path with no generation, because the generation is a suffix.
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
let indices = backoff::retry(
|
||||
|| async { storage.list_files(Some(&index_prefix)).await },
|
||||
|_| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"listing index_part files",
|
||||
&cancel,
|
||||
|
||||
let indices = download_retry(
|
||||
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
|
||||
"list index_part files",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.and_then(|x| x)
|
||||
.map_err(DownloadError::Other)?;
|
||||
.await?;
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
@@ -447,8 +412,6 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
||||
));
|
||||
|
||||
let cancel_inner = cancel.clone();
|
||||
|
||||
let file = download_retry(
|
||||
|| async {
|
||||
let file = OpenOptions::new()
|
||||
@@ -461,13 +424,10 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
|
||||
.await
|
||||
{
|
||||
let download = match storage.download(&remote_path, cancel).await {
|
||||
Ok(dl) => dl,
|
||||
Err(DownloadError::NotFound) => {
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
|
||||
.await?
|
||||
storage.download(&remote_preserved_path, cancel).await?
|
||||
}
|
||||
Err(other) => Err(other)?,
|
||||
};
|
||||
@@ -477,6 +437,7 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
// TODO: this consumption of the response body should be subject to timeout + cancellation, but
|
||||
// not without thinking carefully about how to recover safely from cancelling a write to
|
||||
// local storage (e.g. by writing into a temp file as we do in download_layer)
|
||||
// FIXME: flip the weird error wrapping
|
||||
tokio::io::copy_buf(&mut download, &mut writer)
|
||||
.await
|
||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
||||
@@ -516,7 +477,7 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
/// with backoff.
|
||||
///
|
||||
/// (See similar logic for uploads in `perform_upload_task`)
|
||||
async fn download_retry<T, O, F>(
|
||||
pub(super) async fn download_retry<T, O, F>(
|
||||
op: O,
|
||||
description: &str,
|
||||
cancel: &CancellationToken,
|
||||
@@ -527,7 +488,7 @@ where
|
||||
{
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
DownloadError::is_permanent,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
description,
|
||||
@@ -541,7 +502,7 @@ where
|
||||
async fn download_retry_forever<T, O, F>(
|
||||
op: O,
|
||||
description: &str,
|
||||
cancel: CancellationToken,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
@@ -549,11 +510,11 @@ where
|
||||
{
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
DownloadError::is_permanent,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
u32::MAX,
|
||||
description,
|
||||
&cancel,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| DownloadError::Cancelled)
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
|
||||
remote_initdb_preserved_archive_path, remote_path,
|
||||
},
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, TimeTravelError};
|
||||
@@ -27,7 +27,7 @@ use super::index::LayerFileMetadata;
|
||||
use tracing::info;
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<'a>(
|
||||
pub(crate) async fn upload_index_part<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
@@ -49,16 +49,15 @@ pub(super) async fn upload_index_part<'a>(
|
||||
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
|
||||
upload_cancellable(
|
||||
cancel,
|
||||
storage.upload_storage_object(
|
||||
storage
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
|
||||
index_part_size,
|
||||
&remote_path,
|
||||
),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
@@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
|
||||
storage
|
||||
.upload(reader, fs_size, &storage_path, None, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
|
||||
|
||||
Ok(())
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))
|
||||
}
|
||||
|
||||
/// Uploads the given `initdb` data to the remote storage.
|
||||
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir(
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
|
||||
|
||||
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(
|
||||
cancel,
|
||||
storage.upload_storage_object(file, size as usize, &remote_path),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
storage
|
||||
.upload_storage_object(file, size as usize, &remote_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
@@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive(
|
||||
) -> anyhow::Result<()> {
|
||||
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
|
||||
storage
|
||||
.copy_object(&source_path, &dest_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
@@ -160,7 +160,7 @@ impl SecondaryTenant {
|
||||
&self.tenant_shard_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
|
||||
pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> (DiskUsageEvictionInfo, usize) {
|
||||
self.detail.lock().unwrap().get_layers_for_eviction(self)
|
||||
}
|
||||
|
||||
|
||||
@@ -146,14 +146,15 @@ impl SecondaryDetail {
|
||||
}
|
||||
}
|
||||
|
||||
/// Additionally returns the total number of layers, used for more stable relative access time
|
||||
/// based eviction.
|
||||
pub(super) fn get_layers_for_eviction(
|
||||
&self,
|
||||
parent: &Arc<SecondaryTenant>,
|
||||
) -> DiskUsageEvictionInfo {
|
||||
let mut result = DiskUsageEvictionInfo {
|
||||
max_layer_size: None,
|
||||
resident_layers: Vec::new(),
|
||||
};
|
||||
) -> (DiskUsageEvictionInfo, usize) {
|
||||
let mut result = DiskUsageEvictionInfo::default();
|
||||
let mut total_layers = 0;
|
||||
|
||||
for (timeline_id, timeline_detail) in &self.timelines {
|
||||
result
|
||||
.resident_layers
|
||||
@@ -169,6 +170,10 @@ impl SecondaryDetail {
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
}
|
||||
}));
|
||||
|
||||
// total might be missing currently downloading layers, but as a lower than actual
|
||||
// value it is good enough approximation.
|
||||
total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len();
|
||||
}
|
||||
result.max_layer_size = result
|
||||
.resident_layers
|
||||
@@ -183,7 +188,7 @@ impl SecondaryDetail {
|
||||
result.resident_layers.len()
|
||||
);
|
||||
|
||||
result
|
||||
(result, total_layers)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,9 +317,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(*tenant_shard_id);
|
||||
let Some(tenant) = tenant else {
|
||||
{
|
||||
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
|
||||
}
|
||||
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
|
||||
};
|
||||
|
||||
Ok(PendingDownload {
|
||||
@@ -389,9 +392,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
}
|
||||
|
||||
CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: Instant::now(),
|
||||
}
|
||||
secondary_state,
|
||||
completed_at: Instant::now(),
|
||||
}
|
||||
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||
}
|
||||
}
|
||||
@@ -483,7 +486,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let heatmap_path_bg = heatmap_path.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
tokio::runtime::Handle::current().block_on(async move {
|
||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
|
||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
|
||||
})
|
||||
})
|
||||
.await
|
||||
@@ -520,24 +523,25 @@ impl<'a> TenantDownloader<'a> {
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
let cancel = &self.secondary_state.cancel;
|
||||
|
||||
let heatmap_bytes = backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path)
|
||||
.download(&heatmap_path, cancel)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
Ok(heatmap_bytes)
|
||||
},
|
||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"download heatmap",
|
||||
&self.secondary_state.cancel,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UpdateError::Cancelled)
|
||||
|
||||
@@ -21,18 +21,17 @@ use futures::Future;
|
||||
use md5;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
|
||||
|
||||
use super::{
|
||||
heatmap::HeatMapTenant,
|
||||
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
|
||||
CommandRequest,
|
||||
CommandRequest, UploadCommand,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
|
||||
|
||||
use super::{heatmap::HeatMapTenant, UploadCommand};
|
||||
|
||||
pub(super) async fn heatmap_uploader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
@@ -417,10 +416,10 @@ async fn upload_tenant_heatmap(
|
||||
|| async {
|
||||
let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
|
||||
remote_storage
|
||||
.upload_storage_object(bytes, size, &path)
|
||||
.upload_storage_object(bytes, size, &path, cancel)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
3,
|
||||
u32::MAX,
|
||||
"Uploading heatmap",
|
||||
|
||||
@@ -539,6 +539,12 @@ impl LayerAccessStats {
|
||||
ret
|
||||
}
|
||||
|
||||
/// Get the latest access timestamp, falling back to latest residence event, further falling
|
||||
/// back to `SystemTime::now` for a usable timestamp for eviction.
|
||||
pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
|
||||
self.latest_activity().unwrap_or_else(SystemTime::now)
|
||||
}
|
||||
|
||||
/// Get the latest access timestamp, falling back to latest residence event.
|
||||
///
|
||||
/// This function can only return `None` if there has not yet been a call to the
|
||||
@@ -553,7 +559,7 @@ impl LayerAccessStats {
|
||||
/// that that type can only be produced by inserting into the layer map.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
||||
fn latest_activity(&self) -> Option<SystemTime> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
match inner.last_accesses.recent() {
|
||||
|
||||
@@ -422,27 +422,31 @@ impl DeltaLayerWriterInner {
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
||||
.await
|
||||
let (_, res) = self
|
||||
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init())
|
||||
.await;
|
||||
res
|
||||
}
|
||||
|
||||
async fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
val: Vec<u8>,
|
||||
will_init: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
|
||||
let off = self.blob_writer.write_blob(val).await?;
|
||||
let (val, res) = self.blob_writer.write_blob(val).await;
|
||||
let off = match res {
|
||||
Ok(off) => off,
|
||||
Err(e) => return (val, Err(anyhow::anyhow!(e))),
|
||||
};
|
||||
|
||||
let blob_ref = BlobRef::new(off, will_init);
|
||||
|
||||
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
||||
self.tree.append(&delta_key.0, blob_ref.0)?;
|
||||
|
||||
Ok(())
|
||||
let res = self.tree.append(&delta_key.0, blob_ref.0);
|
||||
(val, res.map_err(|e| anyhow::anyhow!(e)))
|
||||
}
|
||||
|
||||
fn size(&self) -> u64 {
|
||||
@@ -463,7 +467,8 @@ impl DeltaLayerWriterInner {
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
}
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
// Fill in the summary on blk 0
|
||||
@@ -478,17 +483,12 @@ impl DeltaLayerWriterInner {
|
||||
index_root_blk,
|
||||
};
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
@@ -593,9 +593,9 @@ impl DeltaLayerWriter {
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &[u8],
|
||||
val: Vec<u8>,
|
||||
will_init: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
@@ -681,18 +681,12 @@ impl DeltaLayer {
|
||||
|
||||
let new_summary = rewrite(actual_summary);
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here, but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
if buf.spilled() {
|
||||
// The code in DeltaLayerWriterInner just warn!()s for this.
|
||||
// It should probably error out as well.
|
||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -343,18 +343,12 @@ impl ImageLayer {
|
||||
|
||||
let new_summary = rewrite(actual_summary);
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
if buf.spilled() {
|
||||
// The code in ImageLayerWriterInner just warn!()s for this.
|
||||
// It should probably error out as well.
|
||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -601,9 +595,11 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.blob_writer.write_blob(img).await?;
|
||||
let (_img, res) = self.blob_writer.write_blob(img).await;
|
||||
// TODO: re-use the buffer for `img` further upstack
|
||||
let off = res?;
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
@@ -626,7 +622,8 @@ impl ImageLayerWriterInner {
|
||||
.await?;
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
}
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
@@ -641,17 +638,12 @@ impl ImageLayerWriterInner {
|
||||
index_root_blk,
|
||||
};
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
@@ -732,7 +724,7 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img).await
|
||||
}
|
||||
|
||||
|
||||
@@ -474,9 +474,11 @@ impl InMemoryLayer {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
delta_layer_writer
|
||||
.put_value_bytes(key, *lsn, &buf, will_init)
|
||||
.await?;
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(key, *lsn, buf, will_init)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -325,8 +325,8 @@ impl Layer {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.0.info(reset).await
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.0.info(reset)
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
@@ -637,10 +637,10 @@ impl LayerInner {
|
||||
let mut rx = self.status.subscribe();
|
||||
|
||||
let strong = {
|
||||
match self.inner.get_mut().await {
|
||||
match self.inner.get() {
|
||||
Some(mut either) => {
|
||||
self.wanted_evicted.store(true, Ordering::Relaxed);
|
||||
ResidentOrWantedEvicted::downgrade(&mut either)
|
||||
either.downgrade()
|
||||
}
|
||||
None => return Err(EvictionError::NotFound),
|
||||
}
|
||||
@@ -666,7 +666,7 @@ impl LayerInner {
|
||||
// use however late (compared to the initial expressing of wanted) as the
|
||||
// "outcome" now
|
||||
LAYER_IMPL_METRICS.inc_broadcast_lagged();
|
||||
match self.inner.get_mut().await {
|
||||
match self.inner.get() {
|
||||
Some(_) => Err(EvictionError::Downloaded),
|
||||
None => Ok(()),
|
||||
}
|
||||
@@ -784,7 +784,7 @@ impl LayerInner {
|
||||
// use the already held initialization permit because it is impossible to hit the
|
||||
// below paths anymore essentially limiting the max loop iterations to 2.
|
||||
let (value, init_permit) = download(init_permit).await?;
|
||||
let mut guard = self.inner.set(value, init_permit).await;
|
||||
let mut guard = self.inner.set(value, init_permit);
|
||||
let (strong, _upgraded) = guard
|
||||
.get_and_upgrade()
|
||||
.expect("init creates strong reference, we held the init permit");
|
||||
@@ -792,7 +792,7 @@ impl LayerInner {
|
||||
}
|
||||
|
||||
let (weak, permit) = {
|
||||
let mut locked = self.inner.get_mut_or_init(download).await?;
|
||||
let mut locked = self.inner.get_or_init(download).await?;
|
||||
|
||||
if let Some((strong, upgraded)) = locked.get_and_upgrade() {
|
||||
if upgraded {
|
||||
@@ -1014,12 +1014,12 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.desc.filename().file_name();
|
||||
|
||||
// this is not accurate: we could have the file locally but there was a cancellation
|
||||
// and now we are not in sync, or we are currently downloading it.
|
||||
let remote = self.inner.get_mut().await.is_none();
|
||||
let remote = self.inner.get().is_none();
|
||||
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
@@ -1078,7 +1078,7 @@ impl LayerInner {
|
||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||
return;
|
||||
};
|
||||
match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
|
||||
match this.evict_blocking(version) {
|
||||
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
|
||||
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
|
||||
}
|
||||
@@ -1086,7 +1086,7 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
|
||||
fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
|
||||
// deleted or detached timeline, don't do anything.
|
||||
let Some(timeline) = self.timeline.upgrade() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
@@ -1095,7 +1095,7 @@ impl LayerInner {
|
||||
// to avoid starting a new download while we evict, keep holding on to the
|
||||
// permit.
|
||||
let _permit = {
|
||||
let maybe_downloaded = self.inner.get_mut().await;
|
||||
let maybe_downloaded = self.inner.get();
|
||||
|
||||
let (_weak, permit) = match maybe_downloaded {
|
||||
Some(mut guard) => {
|
||||
@@ -1460,10 +1460,6 @@ impl ResidentLayer {
|
||||
&self.owner.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.owner.access_stats()
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.owner.metadata()
|
||||
}
|
||||
|
||||
@@ -12,7 +12,9 @@ use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
keyspace::KeySpaceAccum,
|
||||
models::{
|
||||
@@ -33,17 +35,22 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{
|
||||
array,
|
||||
collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
|
||||
sync::atomic::AtomicU64,
|
||||
};
|
||||
use std::{
|
||||
cmp::{max, min, Ordering},
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::pgdatadir_mapping::DirectoryKind;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -108,10 +115,7 @@ use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{
|
||||
remote_timeline_client::index::{IndexLayerMetadata, IndexPart},
|
||||
storage_layer::LayerFringe,
|
||||
};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub(super) enum FlushLoopState {
|
||||
@@ -260,6 +264,8 @@ pub struct Timeline {
|
||||
// in `crate::page_service` writes these metrics.
|
||||
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
|
||||
|
||||
directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
|
||||
|
||||
/// Ensures layers aren't frozen by checkpointer between
|
||||
/// [`Timeline::get_layer_for_write`] and layer reads.
|
||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||
@@ -945,6 +951,10 @@ impl Timeline {
|
||||
self.metrics.resident_physical_size_get()
|
||||
}
|
||||
|
||||
pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
|
||||
array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
|
||||
}
|
||||
|
||||
///
|
||||
/// Wait until WAL has been received and processed up to this LSN.
|
||||
///
|
||||
@@ -1424,7 +1434,7 @@ impl Timeline {
|
||||
let mut historic_layers = Vec::new();
|
||||
for historic_layer in layer_map.iter_historic_layers() {
|
||||
let historic_layer = guard.get_from_desc(&historic_layer);
|
||||
historic_layers.push(historic_layer.info(reset).await);
|
||||
historic_layers.push(historic_layer.info(reset));
|
||||
}
|
||||
|
||||
LayerMapInfo {
|
||||
@@ -1614,7 +1624,7 @@ impl Timeline {
|
||||
generation,
|
||||
shard_identity,
|
||||
pg_version,
|
||||
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
||||
layers: Default::default(),
|
||||
wanted_image_layers: Mutex::new(None),
|
||||
|
||||
walredo_mgr,
|
||||
@@ -1651,6 +1661,8 @@ impl Timeline {
|
||||
&timeline_id,
|
||||
),
|
||||
|
||||
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
|
||||
|
||||
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
||||
|
||||
layer_flush_start_tx,
|
||||
@@ -2419,6 +2431,29 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
|
||||
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
|
||||
let aux_metric =
|
||||
self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
|
||||
|
||||
let sum_of_entries = self
|
||||
.directory_metrics
|
||||
.iter()
|
||||
.map(|v| v.load(AtomicOrdering::Relaxed))
|
||||
.sum();
|
||||
// Set a high general threshold and a lower threshold for the auxiliary files,
|
||||
// as we can have large numbers of relations in the db directory.
|
||||
const SUM_THRESHOLD: u64 = 5000;
|
||||
const AUX_THRESHOLD: u64 = 1000;
|
||||
if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
|
||||
self.metrics
|
||||
.directory_entries_count_gauge
|
||||
.set(sum_of_entries);
|
||||
} else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
|
||||
metric.set(sum_of_entries);
|
||||
}
|
||||
}
|
||||
|
||||
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
|
||||
let guard = self.layers.read().await;
|
||||
for historic_layer in guard.layer_map().iter_historic_layers() {
|
||||
@@ -2439,45 +2474,28 @@ impl Timeline {
|
||||
/// should treat this as a cue to simply skip doing any heatmap uploading
|
||||
/// for this timeline.
|
||||
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
|
||||
let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
|
||||
// no point in heatmaps without remote client
|
||||
let _remote_client = self.remote_client.as_ref()?;
|
||||
|
||||
let remote_client = match &self.remote_client {
|
||||
Some(c) => c,
|
||||
None => return None,
|
||||
};
|
||||
if !self.is_active() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let layer_file_names = eviction_info
|
||||
.resident_layers
|
||||
.iter()
|
||||
.map(|l| l.layer.get_name())
|
||||
.collect::<Vec<_>>();
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
let decorated = match remote_client.get_layers_metadata(layer_file_names) {
|
||||
Ok(d) => d,
|
||||
Err(_) => {
|
||||
// Getting metadata only fails on Timeline in bad state.
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let resident = guard.resident_layers().map(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
let heatmap_layers = std::iter::zip(
|
||||
eviction_info.resident_layers.into_iter(),
|
||||
decorated.into_iter(),
|
||||
)
|
||||
.filter_map(|(layer, remote_info)| {
|
||||
remote_info.map(|remote_info| {
|
||||
HeatMapLayer::new(
|
||||
layer.layer.get_name(),
|
||||
IndexLayerMetadata::from(remote_info),
|
||||
layer.last_activity_ts,
|
||||
)
|
||||
})
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().filename(),
|
||||
layer.metadata().into(),
|
||||
last_activity_ts,
|
||||
)
|
||||
});
|
||||
|
||||
Some(HeatMapTimeline::new(
|
||||
self.timeline_id,
|
||||
heatmap_layers.collect(),
|
||||
))
|
||||
let layers = resident.collect().await;
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3634,7 +3652,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
image_layer_writer.put_image(img_key, &img).await?;
|
||||
image_layer_writer.put_image(img_key, img).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4968,41 +4986,24 @@ impl Timeline {
|
||||
/// Returns non-remote layers for eviction.
|
||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
let mut resident_layers = Vec::new();
|
||||
|
||||
for l in layers.iter_historic_layers() {
|
||||
let file_size = l.file_size();
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
let resident_layers = guard
|
||||
.resident_layers()
|
||||
.map(|layer| {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
|
||||
let l = guard.get_from_desc(&l);
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
let l = match l.keep_resident().await {
|
||||
Ok(Some(l)) => l,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}");
|
||||
continue;
|
||||
EvictionCandidate {
|
||||
layer: layer.into(),
|
||||
last_activity_ts,
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
}
|
||||
};
|
||||
|
||||
let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| {
|
||||
// We only use this fallback if there's an implementation error.
|
||||
// `latest_activity` already does rate-limited warn!() log.
|
||||
debug!(layer=%l, "last_activity returns None, using SystemTime::now");
|
||||
SystemTime::now()
|
||||
});
|
||||
|
||||
resident_layers.push(EvictionCandidate {
|
||||
layer: l.drop_eviction_guard().into(),
|
||||
last_activity_ts,
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
});
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.await;
|
||||
|
||||
DiskUsageEvictionInfo {
|
||||
max_layer_size,
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
@@ -541,12 +541,7 @@ impl DeleteTimelineFlow {
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span =
|
||||
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
.instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -196,13 +196,13 @@ impl Timeline {
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
candidates: usize,
|
||||
evicted: usize,
|
||||
errors: usize,
|
||||
not_evictable: usize,
|
||||
#[allow(dead_code)]
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
|
||||
@@ -239,12 +239,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
|
||||
// We only use this fallback if there's an implementation error.
|
||||
// `latest_activity` already does rate-limited warn!() log.
|
||||
debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
|
||||
SystemTime::now()
|
||||
});
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use tracing::trace;
|
||||
@@ -20,19 +21,13 @@ use crate::{
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager<Layer>,
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn create() -> Self {
|
||||
Self {
|
||||
layer_map: LayerMap::default(),
|
||||
layer_fmgr: LayerFileManager::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
@@ -246,6 +241,32 @@ impl LayerManager {
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
|
||||
// for small layer maps, we most likely have all resident, but for larger more are likely
|
||||
// to be evicted assuming lots of layers correlated with longer lifespan.
|
||||
|
||||
let layers = self
|
||||
.layer_map()
|
||||
.iter_historic_layers()
|
||||
.map(|desc| self.get_from_desc(&desc));
|
||||
|
||||
let layers = futures::stream::iter(layers);
|
||||
|
||||
layers.filter_map(|layer| async move {
|
||||
// TODO(#6028): this query does not really need to see the ResidentLayer
|
||||
match layer.keep_resident().await {
|
||||
Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
|
||||
Ok(None) => None,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
@@ -253,6 +274,12 @@ impl LayerManager {
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
|
||||
impl<T> Default for LayerFileManager<T> {
|
||||
fn default() -> Self {
|
||||
Self(HashMap::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
@@ -275,10 +302,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
self.0.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
pub(crate) fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: &T) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
|
||||
@@ -19,7 +19,7 @@ use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
|
||||
|
||||
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
||||
use std::os::unix::fs::FileExt;
|
||||
@@ -28,9 +28,10 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::time::Instant;
|
||||
use utils::fs_ext;
|
||||
|
||||
mod io_engine;
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
pub(crate) mod io_engine;
|
||||
mod open_options;
|
||||
pub use io_engine::IoEngineKind;
|
||||
pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
///
|
||||
@@ -409,10 +410,10 @@ impl VirtualFile {
|
||||
/// step, the tmp path is renamed to the final path. As renames are
|
||||
/// atomic, a crash during the write operation will never leave behind a
|
||||
/// partially written file.
|
||||
pub async fn crashsafe_overwrite(
|
||||
pub async fn crashsafe_overwrite<B: BoundedBuf>(
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
content: B,
|
||||
) -> std::io::Result<()> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
@@ -429,7 +430,8 @@ impl VirtualFile {
|
||||
.create_new(true),
|
||||
)
|
||||
.await?;
|
||||
file.write_all(content).await?;
|
||||
let (_content, res) = file.write_all(content).await;
|
||||
res?;
|
||||
file.sync_all().await?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
@@ -580,43 +582,69 @@ impl VirtualFile {
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
|
||||
pub async fn write_all_at<B: BoundedBuf>(
|
||||
&self,
|
||||
buf: B,
|
||||
mut offset: u64,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return (Slice::into_inner(buf.slice_full()), Ok(()));
|
||||
}
|
||||
let mut buf = buf.slice(0..buf_len);
|
||||
while !buf.is_empty() {
|
||||
match self.write_at(buf, offset).await {
|
||||
// TODO: push `buf` further down
|
||||
match self.write_at(&buf, offset).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
return (
|
||||
Slice::into_inner(buf),
|
||||
Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
)),
|
||||
);
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
buf = buf.slice(n..);
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
(Slice::into_inner(buf), Ok(()))
|
||||
}
|
||||
|
||||
pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
|
||||
/// Writes `buf.slice(0..buf.bytes_init())`.
|
||||
/// Returns the IoBuf that is underlying the BoundedBuf `buf`.
|
||||
/// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
|
||||
/// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
|
||||
pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
return (Slice::into_inner(buf.slice_full()), Ok(0));
|
||||
}
|
||||
let mut buf = buf.slice(0..nbytes);
|
||||
while !buf.is_empty() {
|
||||
match self.write(buf).await {
|
||||
// TODO: push `Slice` further down
|
||||
match self.write(&buf).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
return (
|
||||
Slice::into_inner(buf),
|
||||
Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
)),
|
||||
);
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
buf = buf.slice(n..);
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
(Slice::into_inner(buf), Ok(nbytes))
|
||||
}
|
||||
|
||||
async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
@@ -675,7 +703,6 @@ where
|
||||
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
||||
{
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
|
||||
while buf.bytes_total() != 0 {
|
||||
let res;
|
||||
@@ -1050,10 +1077,19 @@ mod tests {
|
||||
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
|
||||
}
|
||||
}
|
||||
async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
|
||||
async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
|
||||
MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let (_buf, res) = file.write_all_at(buf, offset).await;
|
||||
res
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
file.write_all_at(&buf.slice(0..buf_len), offset)
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
@@ -1062,10 +1098,19 @@ mod tests {
|
||||
MaybeVirtualFile::File(file) => file.seek(pos),
|
||||
}
|
||||
}
|
||||
async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
|
||||
async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
|
||||
MaybeVirtualFile::File(file) => file.write_all(buf),
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res.map(|_| ())
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
file.write_all(&buf.slice(0..buf_len))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1140,7 +1185,7 @@ mod tests {
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_a.write_all(b"foobar").await?;
|
||||
file_a.write_all(b"foobar".to_vec()).await?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
let _ = file_a.read_string().await.unwrap_err();
|
||||
@@ -1149,7 +1194,7 @@ mod tests {
|
||||
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
let _ = file_a.write_all(b"bar").await.unwrap_err();
|
||||
let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", file_a.read_string().await?);
|
||||
@@ -1191,8 +1236,8 @@ mod tests {
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_b.write_all_at(b"BAR", 3).await?;
|
||||
file_b.write_all_at(b"FOO", 0).await?;
|
||||
file_b.write_all_at(b"BAR".to_vec(), 3).await?;
|
||||
file_b.write_all_at(b"FOO".to_vec(), 0).await?;
|
||||
|
||||
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
|
||||
|
||||
@@ -1292,7 +1337,7 @@ mod tests {
|
||||
let path = testdir.join("myfile");
|
||||
let tmp_path = testdir.join("myfile.tmp");
|
||||
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
||||
@@ -1301,7 +1346,7 @@ mod tests {
|
||||
assert!(!tmp_path.exists());
|
||||
drop(file);
|
||||
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
||||
@@ -1323,7 +1368,7 @@ mod tests {
|
||||
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
|
||||
assert!(tmp_path.exists());
|
||||
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -7,67 +7,100 @@
|
||||
//!
|
||||
//! Then use [`get`] and [`super::OpenOptions`].
|
||||
|
||||
#[derive(
|
||||
Copy,
|
||||
Clone,
|
||||
PartialEq,
|
||||
Eq,
|
||||
Hash,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
Debug,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum IoEngineKind {
|
||||
pub(crate) use super::api::IoEngineKind;
|
||||
#[derive(Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
pub(crate) enum IoEngine {
|
||||
NotSet,
|
||||
StdFs,
|
||||
#[cfg(target_os = "linux")]
|
||||
TokioEpollUring,
|
||||
}
|
||||
|
||||
static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(super) fn init(engine: IoEngineKind) {
|
||||
if IO_ENGINE.set(engine).is_err() {
|
||||
panic!("called twice");
|
||||
impl From<IoEngineKind> for IoEngine {
|
||||
fn from(value: IoEngineKind) -> Self {
|
||||
match value {
|
||||
IoEngineKind::StdFs => IoEngine::StdFs,
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring,
|
||||
}
|
||||
}
|
||||
crate::metrics::virtual_file_io_engine::KIND
|
||||
.with_label_values(&[&format!("{engine}")])
|
||||
.set(1);
|
||||
}
|
||||
|
||||
pub(super) fn get() -> &'static IoEngineKind {
|
||||
#[cfg(test)]
|
||||
{
|
||||
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
|
||||
IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
|
||||
Ok(v) => match v.parse::<IoEngineKind>() {
|
||||
Ok(engine_kind) => engine_kind,
|
||||
Err(e) => {
|
||||
panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
|
||||
}
|
||||
},
|
||||
Err(std::env::VarError::NotPresent) => {
|
||||
crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var {env_var_name} is not unicode");
|
||||
}
|
||||
impl TryFrom<u8> for IoEngine {
|
||||
type Error = u8;
|
||||
|
||||
fn try_from(value: u8) -> Result<Self, Self::Error> {
|
||||
Ok(match value {
|
||||
v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet,
|
||||
v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs,
|
||||
#[cfg(target_os = "linux")]
|
||||
v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring,
|
||||
x => return Err(x),
|
||||
})
|
||||
}
|
||||
#[cfg(not(test))]
|
||||
IO_ENGINE.get().unwrap()
|
||||
}
|
||||
|
||||
use std::os::unix::prelude::FileExt;
|
||||
static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
|
||||
|
||||
pub(crate) fn set(engine_kind: IoEngineKind) {
|
||||
let engine: IoEngine = engine_kind.into();
|
||||
IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
|
||||
#[cfg(not(test))]
|
||||
{
|
||||
let metric = &crate::metrics::virtual_file_io_engine::KIND;
|
||||
metric.reset();
|
||||
metric
|
||||
.with_label_values(&[&format!("{engine_kind}")])
|
||||
.set(1);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(super) fn init(engine_kind: IoEngineKind) {
|
||||
set(engine_kind);
|
||||
}
|
||||
|
||||
pub(super) fn get() -> IoEngine {
|
||||
let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
|
||||
if cfg!(test) {
|
||||
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
|
||||
match cur {
|
||||
IoEngine::NotSet => {
|
||||
let kind = match std::env::var(env_var_name) {
|
||||
Ok(v) => match v.parse::<IoEngineKind>() {
|
||||
Ok(engine_kind) => engine_kind,
|
||||
Err(e) => {
|
||||
panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
|
||||
}
|
||||
},
|
||||
Err(std::env::VarError::NotPresent) => {
|
||||
crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var {env_var_name} is not unicode");
|
||||
}
|
||||
};
|
||||
self::set(kind);
|
||||
self::get()
|
||||
}
|
||||
x => x,
|
||||
}
|
||||
} else {
|
||||
cur
|
||||
}
|
||||
}
|
||||
|
||||
use std::{
|
||||
os::unix::prelude::FileExt,
|
||||
sync::atomic::{AtomicU8, Ordering},
|
||||
};
|
||||
|
||||
use super::FileGuard;
|
||||
|
||||
impl IoEngineKind {
|
||||
impl IoEngine {
|
||||
pub(super) async fn read_at<B>(
|
||||
&self,
|
||||
file_guard: FileGuard,
|
||||
@@ -78,7 +111,8 @@ impl IoEngineKind {
|
||||
B: tokio_epoll_uring::BoundedBufMut + Send,
|
||||
{
|
||||
match self {
|
||||
IoEngineKind::StdFs => {
|
||||
IoEngine::NotSet => panic!("not initialized"),
|
||||
IoEngine::StdFs => {
|
||||
// SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
|
||||
let dst = unsafe {
|
||||
std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
|
||||
@@ -96,7 +130,7 @@ impl IoEngineKind {
|
||||
((file_guard, buf), res)
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngineKind::TokioEpollUring => {
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let (resources, res) = system.read(file_guard, offset, buf).await;
|
||||
(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
|
||||
|
||||
use super::IoEngineKind;
|
||||
use super::io_engine::IoEngine;
|
||||
use std::{os::fd::OwnedFd, path::Path};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -13,9 +13,10 @@ pub enum OpenOptions {
|
||||
impl Default for OpenOptions {
|
||||
fn default() -> Self {
|
||||
match super::io_engine::get() {
|
||||
IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
|
||||
IoEngine::NotSet => panic!("io engine not set"),
|
||||
IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngineKind::TokioEpollUring => {
|
||||
IoEngine::TokioEpollUring => {
|
||||
Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,7 +346,7 @@ impl WalIngest {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
|
||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||
let xlrec = XlLogicalMessage::decode(&mut buf);
|
||||
let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf);
|
||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
|
||||
if prefix == "neon-test" {
|
||||
@@ -1695,22 +1695,22 @@ mod tests {
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?;
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x50));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
@@ -1751,46 +1751,46 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
test_img("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
test_img("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
test_img("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
test_img("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
@@ -1812,13 +1812,13 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
test_img("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
@@ -1832,7 +1832,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
test_img("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
@@ -1851,7 +1851,7 @@ mod tests {
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
let mut m = tline.begin_modification(Lsn(0x70));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -1870,13 +1870,13 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1")
|
||||
test_img("foo blk 1")
|
||||
);
|
||||
|
||||
// Extend a lot more, leaving a big gap that spans across segments
|
||||
let mut m = tline.begin_modification(Lsn(0x80));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -1897,7 +1897,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1500")
|
||||
test_img("foo blk 1500")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1915,7 +1915,7 @@ mod tests {
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
@@ -1952,7 +1952,7 @@ mod tests {
|
||||
// Re-create it
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
@@ -1990,7 +1990,7 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit(&ctx).await?;
|
||||
@@ -2028,7 +2028,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2055,7 +2055,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2073,7 +2073,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2084,7 +2084,7 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit(&ctx).await?;
|
||||
@@ -2109,7 +2109,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2130,7 +2130,7 @@ mod tests {
|
||||
for blknum in 0..RELSEG_SIZE + 1 {
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
|
||||
.await?;
|
||||
|
||||
@@ -44,6 +44,11 @@ pub enum NeonWalRecord {
|
||||
moff: MultiXactOffset,
|
||||
members: Vec<MultiXactMember>,
|
||||
},
|
||||
/// Update the map of AUX files, either writing or dropping an entry
|
||||
AuxFile {
|
||||
file_path: String,
|
||||
content: Option<Bytes>,
|
||||
},
|
||||
}
|
||||
|
||||
impl NeonWalRecord {
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
mod process;
|
||||
|
||||
/// Code to apply [`NeonWalRecord`]s.
|
||||
mod apply_neon;
|
||||
pub(crate) mod apply_neon;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::{
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use crate::pgdatadir_mapping::AuxFilesDirectory;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
|
||||
};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
/// Can this request be served by neon redo functions
|
||||
/// or we need to pass it to wal-redo postgres process?
|
||||
@@ -230,6 +232,72 @@ pub(crate) fn apply_in_neon(
|
||||
LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
|
||||
}
|
||||
}
|
||||
NeonWalRecord::AuxFile { file_path, content } => {
|
||||
let mut dir = AuxFilesDirectory::des(page)?;
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
|
||||
page.clear();
|
||||
let mut writer = page.writer();
|
||||
dir.ser_into(&mut writer)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::AUX_FILES_KEY;
|
||||
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
|
||||
|
||||
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
|
||||
#[test]
|
||||
fn apply_aux_file_deltas() -> anyhow::Result<()> {
|
||||
let base_dir = AuxFilesDirectory {
|
||||
files: HashMap::from([
|
||||
("two".to_string(), Bytes::from_static(b"content0")),
|
||||
("three".to_string(), Bytes::from_static(b"contentX")),
|
||||
]),
|
||||
};
|
||||
let base_image = AuxFilesDirectory::ser(&base_dir)?;
|
||||
|
||||
let deltas = vec![
|
||||
// Insert
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "one".to_string(),
|
||||
content: Some(Bytes::from_static(b"content1")),
|
||||
},
|
||||
// Update
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "two".to_string(),
|
||||
content: Some(Bytes::from_static(b"content99")),
|
||||
},
|
||||
// Delete
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "three".to_string(),
|
||||
content: None,
|
||||
},
|
||||
];
|
||||
|
||||
let file_path = AUX_FILES_KEY;
|
||||
let mut page = BytesMut::from_iter(base_image);
|
||||
|
||||
for record in deltas {
|
||||
apply_in_neon(&record, file_path, &mut page)?;
|
||||
}
|
||||
|
||||
let reconstructed = AuxFilesDirectory::des(&page)?;
|
||||
let expect = HashMap::from([
|
||||
("one".to_string(), Bytes::from_static(b"content1")),
|
||||
("two".to_string(), Bytes::from_static(b"content99")),
|
||||
]);
|
||||
|
||||
assert_eq!(reconstructed.files, expect);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user