Merge remote-tracking branch 'origin/main' into problame/integrate-tokio-epoll-uring/benchmarking/2024-01-31-prs/async-walredo

This commit is contained in:
Christian Schwarz
2024-04-03 21:11:03 +02:00
182 changed files with 7330 additions and 2410 deletions

View File

@@ -600,32 +600,37 @@ fn start_pageserver(
None,
"consumption metrics collection",
true,
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
{
let tenant_manager = tenant_manager.clone();
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
pageserver::consumption_metrics::collect_metrics(
tenant_manager,
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
}
},
);
}

View File

@@ -94,6 +94,8 @@ pub mod defaults {
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = crate::walredo::ProcessKind::DEFAULT_TOML;
///
@@ -158,6 +160,8 @@ pub mod defaults {
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
[remote_storage]
"#
@@ -234,6 +238,7 @@ pub struct PageServerConf {
// How often to send unchanged cached metrics to the metrics endpoint.
pub cached_metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub metric_collection_bucket: Option<RemoteStorageConfig>,
pub synthetic_size_calculation_interval: Duration,
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -279,6 +284,13 @@ pub struct PageServerConf {
pub validate_vectored_get: bool,
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
/// of ephemeral data.
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
pub walredo_process_kind: crate::walredo::ProcessKind,
}
@@ -374,6 +386,7 @@ struct PageServerConfigBuilder {
cached_metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
synthetic_size_calculation_interval: BuilderValue<Duration>,
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
@@ -400,6 +413,8 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
}
@@ -456,6 +471,8 @@ impl PageServerConfigBuilder {
.expect("cannot parse default synthetic size calculation interval")),
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
metric_collection_bucket: Set(None),
disk_usage_based_eviction: Set(None),
test_remote_failures: Set(0),
@@ -483,6 +500,7 @@ impl PageServerConfigBuilder {
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
}
@@ -585,6 +603,13 @@ impl PageServerConfigBuilder {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
pub fn metric_collection_bucket(
&mut self,
metric_collection_bucket: Option<RemoteStorageConfig>,
) {
self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
}
pub fn synthetic_size_calculation_interval(
&mut self,
synthetic_size_calculation_interval: Duration,
@@ -653,6 +678,10 @@ impl PageServerConfigBuilder {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
self.walredo_process_kind = BuilderValue::Set(value);
}
@@ -696,6 +725,7 @@ impl PageServerConfigBuilder {
metric_collection_interval,
cached_metric_collection_interval,
metric_collection_endpoint,
metric_collection_bucket,
synthetic_size_calculation_interval,
disk_usage_based_eviction,
test_remote_failures,
@@ -710,6 +740,7 @@ impl PageServerConfigBuilder {
get_vectored_impl,
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
walredo_process_kind,
}
CUSTOM LOGIC
@@ -944,6 +975,9 @@ impl PageServerConf {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
},
"metric_collection_bucket" => {
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
}
"synthetic_size_calculation_interval" =>
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -997,6 +1031,9 @@ impl PageServerConf {
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"walredo_process_kind" => {
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
}
@@ -1061,6 +1098,7 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(60),
disk_usage_based_eviction: None,
test_remote_failures: 0,
@@ -1079,6 +1117,7 @@ impl PageServerConf {
.expect("Invalid default constant"),
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}
}
@@ -1292,6 +1331,7 @@ background_task_maximum_delay = '334 s'
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: humantime::parse_duration(
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
)?,
@@ -1314,6 +1354,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
},
"Correct defaults should be used when no config values are provided"
@@ -1366,6 +1407,7 @@ background_task_maximum_delay = '334 s'
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(333),
disk_usage_based_eviction: None,
test_remote_failures: 0,
@@ -1384,6 +1426,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
},
"Should be able to parse all basic config values correctly"

View File

@@ -3,10 +3,13 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use crate::tenant::{
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
use reqwest::Url;
use std::collections::HashMap;
use std::sync::Arc;
@@ -40,7 +43,9 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
/// Main thread that serves metrics collection
#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
tenant_manager: Arc<TenantManager>,
metric_collection_endpoint: &Url,
metric_collection_bucket: &Option<RemoteStorageConfig>,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
@@ -65,15 +70,19 @@ pub async fn collect_metrics(
None,
"synthetic size calculation",
false,
async move {
calculate_synthetic_size_worker(
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
{
let tenant_manager = tenant_manager.clone();
async move {
calculate_synthetic_size_worker(
tenant_manager,
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
}
},
);
@@ -94,13 +103,27 @@ pub async fn collect_metrics(
.build()
.expect("Failed to create http client with timeout");
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
match GenericRemoteStorage::from_config(bucket_config) {
Ok(client) => Some(client),
Err(e) => {
// Non-fatal error: if we were given an invalid config, we will proceed
// with sending metrics over the network, but not to S3.
tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
None
}
}
} else {
None
};
let node_id = node_id.to_string();
loop {
let started_at = Instant::now();
// these are point in time, with variable "now"
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
let metrics = Arc::new(metrics);
@@ -118,10 +141,18 @@ pub async fn collect_metrics(
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
}
}
if let Some(bucket_client) = &bucket_client {
let res =
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
if let Err(e) = res {
tracing::error!("failed to upload to S3: {e:#}");
}
}
};
let upload = async {
let res = upload::upload_metrics(
let res = upload::upload_metrics_http(
&client,
metric_collection_endpoint,
&cancel,
@@ -132,7 +163,7 @@ pub async fn collect_metrics(
.await;
if let Err(e) = res {
// serialization error which should never happen
tracing::error!("failed to upload due to {e:#}");
tracing::error!("failed to upload via HTTP due to {e:#}");
}
};
@@ -247,6 +278,7 @@ async fn reschedule(
/// Caclculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
tenant_manager: Arc<TenantManager>,
synthetic_size_calculation_interval: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -259,7 +291,7 @@ async fn calculate_synthetic_size_worker(
loop {
let started_at = Instant::now();
let tenants = match mgr::list_tenants().await {
let tenants = match tenant_manager.list_tenants() {
Ok(tenants) => tenants,
Err(e) => {
warn!("cannot get tenant list: {e:#}");
@@ -278,10 +310,14 @@ async fn calculate_synthetic_size_worker(
continue;
}
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
continue;
};
if !tenant.is_active() {
continue;
}
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
@@ -319,9 +355,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate. we do not need any checks
// in this function because `mgr::get_tenant` will error out after shutdown has
// progressed to shutting down tenants.
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))

View File

@@ -1,3 +1,4 @@
use crate::tenant::mgr::TenantManager;
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
@@ -181,6 +182,7 @@ impl MetricsKey {
}
pub(super) async fn collect_all_metrics(
tenant_manager: &Arc<TenantManager>,
cached_metrics: &Cache,
ctx: &RequestContext,
) -> Vec<RawMetric> {
@@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics(
let started_at = std::time::Instant::now();
let tenants = match crate::tenant::mgr::list_tenants().await {
let tenants = match tenant_manager.list_tenants() {
Ok(tenants) => tenants,
Err(err) => {
tracing::error!("failed to list tenants: {:?}", err);
@@ -200,7 +202,8 @@ pub(super) async fn collect_all_metrics(
if state != TenantState::Active || !id.is_zero() {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
tenant_manager
.get_attached_tenant_shard(id)
.ok()
.map(|tenant| (id.tenant_id, tenant))
}

View File

@@ -1,4 +1,9 @@
use std::time::SystemTime;
use chrono::{DateTime, Utc};
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -13,8 +18,9 @@ struct Ids {
pub(super) timeline_id: Option<TimelineId>,
}
/// Serialize and write metrics to an HTTP endpoint
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics(
pub(super) async fn upload_metrics_http(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
@@ -74,6 +80,60 @@ pub(super) async fn upload_metrics(
Ok(())
}
/// Serialize and write metrics to a remote storage object
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics_bucket(
client: &GenericRemoteStorage,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
) -> anyhow::Result<()> {
if metrics.is_empty() {
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
// of an empty object.
return Ok(());
}
// Compose object path
let datetime: DateTime<Utc> = SystemTime::now().into();
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
// Set up a gzip writer into a buffer
let mut compressed_bytes: Vec<u8> = Vec::new();
let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
// Serialize and write into compressed buffer
let started_at = std::time::Instant::now();
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
let (_chunk, body) = res?;
gzip_writer.write_all(&body).await?;
}
gzip_writer.flush().await?;
gzip_writer.shutdown().await?;
let compressed_length = compressed_bytes.len();
// Write to remote storage
client
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
compressed_length,
&path,
cancel,
)
.await?;
let elapsed = started_at.elapsed();
tracing::info!(
compressed_length,
elapsed_ms = elapsed.as_millis(),
"write metrics bucket at {path}",
);
Ok(())
}
// The return type is quite ugly, but we gain testability in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,

View File

@@ -61,7 +61,6 @@ use crate::{
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
@@ -814,8 +813,8 @@ async fn collect_eviction_candidates(
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
let tenants = tenant_manager
.list_tenants()
.context("get list of tenants")?;
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
@@ -827,8 +826,12 @@ async fn collect_eviction_candidates(
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
Ok(tenant) => tenant,
let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
Ok(tenant) if tenant.is_active() => tenant,
Ok(_) => {
debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
continue;
}
Err(e) => {
// this can happen if tenant has lifecycle transition after we fetched it
debug!("failed to get tenant: {e:#}");

View File

@@ -1038,7 +1038,7 @@ paths:
format: hex
responses:
"201":
description: TimelineInfo
description: Timeline was created, or already existed with matching parameters
content:
application/json:
schema:
@@ -1068,11 +1068,17 @@ paths:
schema:
$ref: "#/components/schemas/Error"
"409":
description: Timeline already exists, creation skipped
description: Timeline already exists, with different parameters. Creation cannot proceed.
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"429":
description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"500":
description: Generic operation error
content:

View File

@@ -49,8 +49,8 @@ use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::remote_timeline_client;
@@ -249,16 +249,11 @@ impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
GetTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
// in fact exist locally. If we did, the caller could draw the conclusion
// that it can attach the tenant to another PS and we'd be in split-brain.
//
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into())
}
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -269,6 +264,9 @@ impl From<GetTenantError> for ApiError {
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
GetActiveTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -279,19 +277,6 @@ impl From<GetActiveTenantError> for ApiError {
}
}
impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
}
impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
@@ -495,7 +480,7 @@ async fn timeline_create_handler(
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)?;
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -535,10 +520,13 @@ async fn timeline_create_handler(
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
)
}
Err(
e @ tenant::CreateTimelineError::Conflict
| e @ tenant::CreateTimelineError::AlreadyCreating,
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
Err(e @ tenant::CreateTimelineError::Conflict) => {
json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
}
Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
StatusCode::TOO_MANY_REQUESTS,
HttpErrorBody::from_msg(e.to_string()),
),
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
StatusCode::NOT_ACCEPTABLE,
HttpErrorBody::from_msg(format!("{err:#}")),
@@ -581,7 +569,7 @@ async fn timeline_list_handler(
let response_data = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)?;
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -619,6 +607,7 @@ async fn timeline_preserve_initdb_handler(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
// Part of the process for disaster recovery from safekeeper-stored WAL:
// If we don't recover into a new timeline but want to keep the timeline ID,
@@ -626,7 +615,9 @@ async fn timeline_preserve_initdb_handler(
// location where timeline recreation cand find it.
async {
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -668,7 +659,7 @@ async fn timeline_detail_handler(
let timeline_info = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)?;
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -855,7 +846,7 @@ async fn timeline_delete_handler(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)
.get_attached_tenant_shard(tenant_shard_id)
.map_err(|e| {
match e {
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
@@ -973,10 +964,11 @@ async fn tenant_list_handler(
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let state = get_state(&request);
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
let response_data = state
.tenant_manager
.list_tenants()
.map_err(|_| {
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
})?
@@ -999,9 +991,27 @@ async fn tenant_status(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
let activate = true;
#[cfg(feature = "testing")]
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
let tenant_info = async {
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
if activate {
// This is advisory: we prefer to let the tenant activate on-demand when this function is
// called, but it is still valid to return 200 and describe the current state of the tenant
// if it doesn't make it into an active state.
tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await
.ok();
}
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
@@ -1074,9 +1084,7 @@ async fn tenant_size_handler(
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let state = get_state(&request);
if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!(
@@ -1084,6 +1092,12 @@ async fn tenant_size_handler(
)));
}
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// this can be long operation
let inputs = tenant
.gather_size_inputs(
@@ -1152,10 +1166,15 @@ async fn tenant_shard_split_handler(
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let new_shards = state
.tenant_manager
.shard_split(
tenant_shard_id,
tenant,
ShardCount::new(req.new_shard_count),
req.new_stripe_size,
&ctx,
@@ -1373,8 +1392,11 @@ async fn get_tenant_config_handler(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let response = HashMap::from([
(
@@ -1402,15 +1424,31 @@ async fn update_tenant_config_handler(
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let tenant_conf =
let new_tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let state = get_state(&request);
state
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let tenant = state
.tenant_manager
.set_new_tenant_config(tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", %tenant_id))
.await?;
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.get_generation(),
&ShardParameters::default(),
);
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(ApiError::InternalServerError)?;
tenant.set_new_tenant_config(new_tenant_conf);
json_response(StatusCode::OK, ())
}
@@ -1634,10 +1672,12 @@ async fn handle_tenant_break(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await;
let state = get_state(&r);
state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?
.set_broken("broken from test".to_owned())
.await;
json_response(StatusCode::OK, ())
}
@@ -1881,7 +1921,7 @@ async fn active_timeline_of_active_tenant(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

View File

@@ -435,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
"The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "shard_id", "timeline_id"]
)
@@ -699,6 +699,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register pageserver_startup_is_loading")
});
pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_timeline_ephemeral_bytes",
"Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
)
.expect("Failed to register metric")
});
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
///
@@ -1475,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
});
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"

View File

@@ -760,6 +760,7 @@ impl PageServerHandler {
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
timeline
.import_basebackup_from_tar(
tenant.clone(),
&mut copyin_reader,
base_lsn,
self.broker_client.clone(),
@@ -875,7 +876,13 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline.wait_lsn(lsn, ctx).await?;
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -887,7 +894,13 @@ impl PageServerHandler {
"invalid LSN(0) in request".into(),
));
}
timeline.wait_lsn(lsn, ctx).await?;
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
}
if lsn < **latest_gc_cutoff_lsn {
@@ -1214,7 +1227,13 @@ impl PageServerHandler {
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
timeline.wait_lsn(lsn, ctx).await?;
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.context("invalid basebackup lsn")?;

View File

@@ -214,13 +214,12 @@ pub enum TaskKind {
/// Internally, `Client` hands over requests to the `Connection` object.
/// The `Connection` object is responsible for speaking the wire protocol.
///
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// That abstraction doesn't use `task_mgr`.
/// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
///
/// Once the connection is established, the `TaskHandle` task creates a
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
/// Once the connection is established, the `TaskHandle` task spawns a
/// [`WalReceiverConnectionPoller`] task that is responsible for polling
/// the `Connection` object.
/// A `CancellationToken` created by the `TaskHandle` task ensures
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
@@ -230,7 +229,6 @@ pub enum TaskKind {
WalReceiverManager,
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
/// See the comment on [`WalReceiverManager`].
///
/// [`WalReceiverManager`]: Self::WalReceiverManager

View File

@@ -12,6 +12,7 @@
//!
use anyhow::{bail, Context};
use arc_swap::ArcSwap;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::{Mutex, RwLock};
use std::sync::Mutex;
use std::time::{Duration, Instant};
use crate::span;
@@ -260,7 +261,7 @@ pub struct Tenant {
// We keep TenantConfOpt sturct here to preserve the information
// about parameters that are not set.
// This is necessary to allow global config updates.
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_shard_id: TenantShardId,
@@ -1411,7 +1412,7 @@ impl Tenant {
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline(
&self,
self: &Arc<Tenant>,
new_timeline_id: TimelineId,
ancestor_timeline_id: Option<TimelineId>,
mut ancestor_start_lsn: Option<Lsn>,
@@ -1515,7 +1516,7 @@ impl Tenant {
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline
.wait_lsn(*lsn, ctx)
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1559,7 +1560,7 @@ impl Tenant {
})?;
}
loaded_timeline.activate(broker_client, None, ctx);
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
Ok(loaded_timeline)
}
@@ -1606,7 +1607,7 @@ impl Tenant {
);
{
let conf = self.tenant_conf.read().unwrap();
let conf = self.tenant_conf.load();
if !conf.location.may_delete_layers_hint() {
info!("Skipping GC in location state {:?}", conf.location);
@@ -1633,7 +1634,7 @@ impl Tenant {
}
{
let conf = self.tenant_conf.read().unwrap();
let conf = self.tenant_conf.load();
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(());
@@ -1731,7 +1732,12 @@ impl Tenant {
let mut activated_timelines = 0;
for timeline in timelines_to_activate {
timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
timeline.activate(
self.clone(),
broker_client.clone(),
background_jobs_can_start,
ctx,
);
activated_timelines += 1;
}
@@ -1777,7 +1783,7 @@ impl Tenant {
async fn shutdown(
&self,
shutdown_progress: completion::Barrier,
freeze_and_flush: bool,
shutdown_mode: timeline::ShutdownMode,
) -> Result<(), completion::Barrier> {
span::debug_assert_current_span_has_tenant_id();
@@ -1824,16 +1830,8 @@ impl Tenant {
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let timeline_id = timeline.timeline_id;
let span =
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
} else {
timeline.shutdown().instrument(span).await
}
});
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
})
};
// test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2063,7 +2061,12 @@ impl Tenant {
TenantState::Active { .. } => {
return Ok(());
}
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
TenantState::Broken { reason, .. } => {
// This is fatal, and reported distinctly from the general case of "will never be active" because
// it's logically a 500 to external API users (broken is always a bug).
return Err(GetActiveTenantError::Broken(reason));
}
TenantState::Stopping { .. } => {
// There's no chance the tenant can transition back into ::Active
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
}
@@ -2072,14 +2075,14 @@ impl Tenant {
}
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
self.tenant_conf.read().unwrap().location.attach_mode
self.tenant_conf.load().location.attach_mode
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.tenant_conf.read().unwrap();
let conf = self.tenant_conf.load();
let location_config_mode = match conf.location.attach_mode {
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2141,7 +2144,7 @@ impl Tenant {
// Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers.
tl_client.shutdown().await?;
tl_client.shutdown().await;
// Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
@@ -2226,7 +2229,7 @@ where
impl Tenant {
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
self.tenant_conf.read().unwrap().tenant_conf.clone()
self.tenant_conf.load().tenant_conf.clone()
}
pub fn effective_config(&self) -> TenantConf {
@@ -2235,84 +2238,84 @@ impl Tenant {
}
pub fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
pub fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
pub fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
pub fn get_compaction_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_period
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
}
pub fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub fn get_gc_horizon(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.gc_horizon
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
}
pub fn get_gc_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.gc_period
.unwrap_or(self.conf.default_tenant_conf.gc_period)
}
pub fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
pub fn get_pitr_interval(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.pitr_interval
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_trace_read_requests(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.trace_read_requests
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn get_min_resident_size_override(&self) -> Option<u64> {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.min_resident_size_override
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_heatmap_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let heatmap_period = tenant_conf
.heatmap_period
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2324,26 +2327,40 @@ impl Tenant {
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
self.tenant_conf_updated();
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
// this race is not possible if both request types come from the storage
// controller (as they should!) because an exclusive op lock is required
// on the storage controller side.
self.tenant_conf.rcu(|inner| {
Arc::new(AttachedTenantConf {
tenant_conf: new_tenant_conf.clone(),
location: inner.location,
})
});
self.tenant_conf_updated(&new_tenant_conf);
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated();
timeline.tenant_conf_updated(&new_tenant_conf);
}
}
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
*self.tenant_conf.write().unwrap() = new_conf;
self.tenant_conf_updated();
let new_tenant_conf = new_conf.tenant_conf.clone();
self.tenant_conf.store(Arc::new(new_conf));
self.tenant_conf_updated(&new_tenant_conf);
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated();
timeline.tenant_conf_updated(&new_tenant_conf);
}
}
@@ -2357,11 +2374,8 @@ impl Tenant {
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
}
pub(crate) fn tenant_conf_updated(&self) {
let conf = {
let guard = self.tenant_conf.read().unwrap();
Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
};
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
self.timeline_get_throttle.reconfigure(conf)
}
@@ -2509,7 +2523,7 @@ impl Tenant {
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
}
}
@@ -3495,7 +3509,7 @@ impl Tenant {
}
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.read().unwrap().tenant_conf.clone()
self.tenant_conf.load().tenant_conf.clone()
}
}
@@ -3643,6 +3657,9 @@ pub(crate) mod harness {
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
}
}
}
@@ -3841,6 +3858,7 @@ mod tests {
use hex_literal::hex;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tests::timeline::ShutdownMode;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4286,7 +4304,7 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.instrument(harness.span())
.await
.ok()
@@ -4327,7 +4345,7 @@ mod tests {
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.instrument(harness.span())
.await
.ok()
@@ -5108,7 +5126,7 @@ mod tests {
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown()
.shutdown(super::timeline::ShutdownMode::Hard)
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
.await;
std::mem::forget(tline);

View File

@@ -57,6 +57,9 @@ pub mod defaults {
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
// By default ingest enough WAL for two new L0 layers before checking if new image
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
@@ -362,6 +365,10 @@ pub struct TenantConf {
pub lazy_slru_download: bool,
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
}
impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
.timeline_get_throttle
.clone()
.unwrap_or(global_conf.timeline_get_throttle),
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
}
}
}
@@ -548,6 +561,7 @@ impl Default for TenantConf {
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
}
}
}
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
}
}
}

View File

@@ -14,7 +14,10 @@ use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
tenant::{
mgr::{TenantSlot, TenantsMapRemoveResult},
timeline::ShutdownMode,
},
};
use super::{
@@ -111,6 +114,7 @@ async fn create_local_delete_mark(
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
@@ -462,7 +466,7 @@ impl DeleteTenantFlow {
// tenant.shutdown
// Its also bad that we're holding tenants.read here.
// TODO relax set_stopping to be idempotent?
if tenant.shutdown(progress, false).await.is_err() {
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"tenant shutdown is already in progress"
)));

View File

@@ -72,6 +72,10 @@ impl EphemeralFile {
self.len
}
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn read_blk(
&self,
blknum: u32,

View File

@@ -346,35 +346,6 @@ where
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub enum InMemoryLayerHandle {
Open {
lsn_floor: Lsn,
end_lsn: Lsn,
},
Frozen {
idx: usize,
lsn_floor: Lsn,
end_lsn: Lsn,
},
}
impl InMemoryLayerHandle {
pub fn get_lsn_floor(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
}
}
pub fn get_end_lsn(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
self.historic.iter()
}
/// Get a handle for the first in memory layer that matches the provided predicate.
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
///
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
/// the same exclusive region established by holding the layer manager lock.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
where
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
{
if let Some(open) = &self.open_layer {
if pred(open) {
return Some(InMemoryLayerHandle::Open {
lsn_floor: open.get_lsn_range().start,
end_lsn: open.get_lsn_range().end,
});
return Some(open.clone());
}
}
let pos = self.frozen_layers.iter().rev().position(pred);
pos.map(|rev_idx| {
let idx = self.frozen_layers.len() - 1 - rev_idx;
InMemoryLayerHandle::Frozen {
idx,
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
}
})
}
/// Get the layer pointed to by the provided handle.
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
match handle {
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
}
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
}
///

View File

@@ -4,7 +4,7 @@
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::{LocationConfigMode, ShardParameters};
use pageserver_api::models::LocationConfigMode;
use pageserver_api::shard::{
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
};
@@ -16,6 +16,7 @@ use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
use std::time::{Duration, Instant};
use sysinfo::SystemExt;
use tokio::fs;
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
@@ -39,10 +40,11 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
TenantConfOpt,
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -543,6 +545,18 @@ pub async fn init_tenant_mgr(
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
// Initialize dynamic limits that depend on system resources
let system_memory =
sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
.total_memory();
let max_ephemeral_layer_bytes =
conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
max_ephemeral_layer_bytes,
std::sync::atomic::Ordering::Relaxed,
);
// Scan local filesystem for attached tenants
let tenant_configs = init_load_tenant_configs(conf).await?;
@@ -770,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
join_set.spawn(
async move {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
t.shutdown(shutdown_progress, freeze_and_flush).await
t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
};
if let Err(other_progress) = res {
@@ -875,16 +887,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
// caller will log how long we took
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum SetNewTenantConfigError {
#[error(transparent)]
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
#[error(transparent)]
Other(anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum UpsertLocationError {
#[error("Bad config request: {0}")]
@@ -910,32 +912,21 @@ impl TenantManager {
self.conf
}
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
/// undergoing a state change (i.e. slot is InProgress).
///
/// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
pub(crate) fn get_attached_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
match peek_slot {
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
@@ -1115,7 +1106,7 @@ impl TenantManager {
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
@@ -1231,7 +1222,7 @@ impl TenantManager {
TenantSlot::Attached(tenant) => {
let (_guard, progress) = utils::completion::channel();
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
match tenant.shutdown(progress, false).await {
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
info!("Finished shutting down just-spawned tenant");
}
@@ -1281,7 +1272,7 @@ impl TenantManager {
};
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, false).await {
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
@@ -1428,7 +1419,8 @@ impl TenantManager {
.wait_to_become_active(activation_timeout)
.await
.map_err(|e| match e {
GetActiveTenantError::WillNotBecomeActive(_) => {
GetActiveTenantError::WillNotBecomeActive(_)
| GetActiveTenantError::Broken(_) => {
DeleteTenantError::InvalidState(tenant.current_state())
}
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
@@ -1455,29 +1447,30 @@ impl TenantManager {
result
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
pub(crate) async fn shard_split(
&self,
tenant_shard_id: TenantShardId,
tenant: Arc<Tenant>,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let tenant_shard_id = *tenant.get_tenant_shard_id();
let r = self
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
.do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
.await;
if r.is_err() {
// Shard splitting might have left the original shard in a partially shut down state (it
// stops the shard's remote timeline client). Reset it to ensure we leave things in
// a working state.
if self.get(tenant_shard_id).is_some() {
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
tracing::warn!("Resetting after shard split failure");
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
// Log this error because our return value will still be the original error, not this one. This is
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
// setting it broken probably won't help either.
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
tracing::error!("Failed to reset: {e}");
}
}
}
@@ -1487,12 +1480,12 @@ impl TenantManager {
pub(crate) async fn do_shard_split(
&self,
tenant_shard_id: TenantShardId,
tenant: Arc<Tenant>,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let tenant = get_tenant(tenant_shard_id, true)?;
let tenant_shard_id = *tenant.get_tenant_shard_id();
// Validate the incoming request
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
@@ -1538,7 +1531,6 @@ impl TenantManager {
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
// have been left in a partially-shut-down state.
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
self.reset_tenant(tenant_shard_id, false, ctx).await?;
return Err(e);
}
@@ -1656,7 +1648,14 @@ impl TenantManager {
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
"failpoint"
)));
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
if let Err(e) = timeline
.wait_lsn(
*target_lsn,
crate::tenant::timeline::WaitLsnWaiter::Tenant,
ctx,
)
.await
{
// Failure here might mean shutdown, in any case this part is an optimization
// and we shouldn't hold up the split operation.
tracing::warn!(
@@ -1677,7 +1676,7 @@ impl TenantManager {
// Phase 5: Shut down the parent shard, and erase it from disk
let (_guard, progress) = completion::channel();
match parent.shutdown(progress, false).await {
match parent.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {}
Err(other) => {
other.wait().await;
@@ -1936,38 +1935,23 @@ impl TenantManager {
removal_result
}
pub(crate) async fn set_new_tenant_config(
pub(crate) fn list_tenants(
&self,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
// Legacy API: does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.generation,
&ShardParameters::default(),
);
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
.collect())
}
}
@@ -1980,51 +1964,12 @@ pub(crate) enum GetTenantError {
#[error("Tenant {0} is not active")]
NotActive(TenantShardId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
#[error("Tenant is broken: {0}")]
Broken(String),
// Initializing or shutting down: cannot authoritatively say whether we have this tenant
#[error("Tenant map is not available: {0}")]
MapState(#[from] TenantMapError),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
///
/// This method is cancel-safe.
pub(crate) fn get_tenant(
tenant_shard_id: TenantShardId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let locked = TENANTS.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
match peek_slot {
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetActiveTenantError {
/// We may time out either while TenantSlot is InProgress, or while the Tenant
@@ -2048,6 +1993,12 @@ pub(crate) enum GetActiveTenantError {
/// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
#[error("will not become active. Current state: {0}")]
WillNotBecomeActive(TenantState),
/// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
/// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
/// never happen.
#[error("Tenant is broken: {0}")]
Broken(String),
}
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
@@ -2267,27 +2218,6 @@ pub(crate) enum TenantMapListError {
Initializing,
}
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants(
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
.collect())
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantMapInsertError {
#[error(transparent)]
@@ -2733,11 +2663,11 @@ where
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
let shutdown_mode = ShutdownMode::Hard;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, freeze_and_flush).await {
match tenant.shutdown(progress, shutdown_mode).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to

View File

@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::download::download_retry;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::upload_queue::Delete;
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
config::PageServerConf,
@@ -266,15 +266,6 @@ pub enum MaybeDeletedIndexPart {
Deleted(IndexPart),
}
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
#[derive(Debug, thiserror::Error)]
pub enum StopError {
/// Returned if the upload queue was never initialized.
/// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
#[error("queue is not initialized")]
QueueUninitialized,
}
#[derive(Debug, thiserror::Error)]
pub enum PersistIndexPartWithDeletedFlagError {
#[error("another task is already setting the deleted_flag, started at {0:?}")]
@@ -399,15 +390,10 @@ impl RemoteTimelineClient {
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
))?;
{
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
}
// also locks upload queue, without dropping the guard above it will be a deadlock
self.stop().expect("initialized line above");
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
self.stop_impl(&mut upload_queue);
upload_queue
.stopped_mut()
@@ -421,7 +407,8 @@ impl RemoteTimelineClient {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
UploadQueue::Stopped(q) => q
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
.upload_queue_for_deletion
.get_last_remote_consistent_lsn_projected(),
}
@@ -431,7 +418,8 @@ impl RemoteTimelineClient {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
UploadQueue::Stopped(q) => Some(
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
q.upload_queue_for_deletion
.get_last_remote_consistent_lsn_visible(),
),
@@ -898,7 +886,7 @@ impl RemoteTimelineClient {
/// Wait for all previously scheduled operations to complete, and then stop.
///
/// Not cancellation safe
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
pub(crate) async fn shutdown(self: &Arc<Self>) {
// On cancellation the queue is left in ackward state of refusing new operations but
// proper stop is yet to be called. On cancel the original or some later task must call
// `stop` or `shutdown`.
@@ -909,8 +897,12 @@ impl RemoteTimelineClient {
let fut = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = match &mut *guard {
UploadQueue::Stopped(_) => return Ok(()),
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
UploadQueue::Stopped(_) => return,
UploadQueue::Uninitialized => {
// transition into Stopped state
self.stop_impl(&mut guard);
return;
}
UploadQueue::Initialized(ref mut init) => init,
};
@@ -942,7 +934,7 @@ impl RemoteTimelineClient {
}
}
self.stop()
self.stop();
}
/// Set the deleted_at field in the remote index file.
@@ -1324,12 +1316,7 @@ impl RemoteTimelineClient {
// upload finishes or times out soon enough.
if cancel.is_cancelled() {
info!("upload task cancelled by shutdown request");
match self.stop() {
Ok(()) => {}
Err(StopError::QueueUninitialized) => {
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
}
}
self.stop();
return;
}
@@ -1582,19 +1569,25 @@ impl RemoteTimelineClient {
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
///
/// In-progress operations will still be running after this function returns.
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
/// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
/// to wait for them to complete, after calling this function.
pub(crate) fn stop(&self) -> Result<(), StopError> {
pub(crate) fn stop(&self) {
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
let mut guard = self.upload_queue.lock().unwrap();
match &mut *guard {
UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
self.stop_impl(&mut guard);
}
fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
match &mut **guard {
UploadQueue::Uninitialized => {
info!("UploadQueue is in state Uninitialized, nothing to do");
**guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
}
UploadQueue::Stopped(_) => {
// nothing to do
info!("another concurrent task already shut down the queue");
Ok(())
}
UploadQueue::Initialized(initialized) => {
info!("shutting down upload queue");
@@ -1627,11 +1620,13 @@ impl RemoteTimelineClient {
};
let upload_queue = std::mem::replace(
&mut *guard,
UploadQueue::Stopped(UploadQueueStopped {
upload_queue_for_deletion,
deleted_at: SetDeletedFlagProgress::NotRunning,
}),
&mut **guard,
UploadQueue::Stopped(UploadQueueStopped::Deletable(
UploadQueueStoppedDeletable {
upload_queue_for_deletion,
deleted_at: SetDeletedFlagProgress::NotRunning,
},
)),
);
if let UploadQueue::Initialized(qi) = upload_queue {
qi
@@ -1660,10 +1655,6 @@ impl RemoteTimelineClient {
// which is exactly what we want to happen.
drop(op);
}
// We're done.
drop(guard);
Ok(())
}
}
}

View File

@@ -11,11 +11,11 @@ use crate::{
disk_usage_eviction_task::{
finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
},
is_temporary,
metrics::SECONDARY_MODE,
tenant::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
ephemeral_file::is_ephemeral_file,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
local_path,
meta.len(),
);
}
Err(e) => {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
local_path,
e
);
debug_assert!(false);
}
}
}
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{
@@ -964,7 +993,7 @@ async fn init_timeline_state(
continue;
} else if crate::is_temporary(&file_path)
|| is_temp_download_file(&file_path)
|| is_temporary(&file_path)
|| is_ephemeral_file(file_name)
{
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");

View File

@@ -9,6 +9,7 @@ use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode,
mgr::GetTenantError,
mgr::TenantManager,
remote_timeline_client::remote_heatmap_path,
span::debug_assert_current_span_has_tenant_id,
@@ -292,8 +293,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
"Starting heatmap write on command");
let tenant = self
.tenant_manager
.get_attached_tenant_shard(*tenant_shard_id, true)
.get_attached_tenant_shard(*tenant_shard_id)
.map_err(|e| anyhow::anyhow!(e))?;
if !tenant.is_active() {
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
}
Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed

View File

@@ -3,7 +3,7 @@
pub mod delta_layer;
mod filename;
pub mod image_layer;
mod inmemory_layer;
pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use super::layer_map::InMemoryLayerHandle;
use super::timeline::layer_manager::LayerManager;
use self::inmemory_layer::InMemoryLayerFileId;
use super::timeline::GetVectoredError;
use super::PageReconstructError;
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
}
}
/// Description of layer to be read - the layer map can turn
/// this description into the actual layer.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub(crate) enum ReadableLayerDesc {
Persistent {
desc: PersistentLayerDesc,
lsn_range: Range<Lsn>,
},
InMemory {
handle: InMemoryLayerHandle,
lsn_ceil: Lsn,
},
/// A key that uniquely identifies a layer in a timeline
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub(crate) enum LayerId {
PersitentLayerId(PersistentLayerKey),
InMemoryLayerId(InMemoryLayerFileId),
}
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
#[derive(Debug)]
struct ReadableLayerDescOrdered(ReadableLayerDesc);
pub(crate) enum ReadableLayer {
PersistentLayer(Layer),
InMemoryLayer(Arc<InMemoryLayer>),
}
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
}
/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
layers: HashMap<ReadableLayerDesc, KeySpace>,
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpace,
}
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
layers_by_lsn: BinaryHeap::new(),
planned_reads_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
let handle = match self.layers_by_lsn.pop() {
Some(h) => h,
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
None => return None,
};
let removed = self.layers.remove_entry(&handle.0);
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((layer, keyspace)) => Some((layer, keyspace)),
Some((
_,
LayerKeyspace {
layer,
target_keyspace,
},
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
None => unreachable!("fringe internals are always consistent"),
}
}
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
let entry = self.layers.entry(layer.clone());
pub(crate) fn update(
&mut self,
layer: ReadableLayer,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().merge(&keyspace);
entry.get_mut().target_keyspace.merge(&keyspace);
}
Entry::Vacant(entry) => {
self.layers_by_lsn
.push(ReadableLayerDescOrdered(entry.key().clone()));
entry.insert(keyspace);
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_id: layer_id.clone(),
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: keyspace,
});
}
}
}
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
}
}
impl Ord for ReadableLayerDescOrdered {
impl Ord for ReadDesc {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
if ord == std::cmp::Ordering::Equal {
self.0
.get_lsn_floor()
.cmp(&other.0.get_lsn_floor())
.reverse()
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
} else {
ord
}
}
}
impl PartialOrd for ReadableLayerDescOrdered {
impl PartialOrd for ReadDesc {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadableLayerDescOrdered {
impl PartialEq for ReadDesc {
fn eq(&self, other: &Self) -> bool {
self.0.get_lsn_floor() == other.0.get_lsn_floor()
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
self.lsn_range == other.lsn_range
}
}
impl Eq for ReadableLayerDescOrdered {}
impl Eq for ReadDesc {}
impl ReadableLayerDesc {
pub(crate) fn get_lsn_floor(&self) -> Lsn {
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {
match self {
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
}
}
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
layer_manager: &LayerManager,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayerDesc::Persistent { desc, lsn_range } => {
let layer = layer_manager.get_from_desc(desc);
ReadableLayer::PersistentLayer(layer) => {
layer
.get_values_reconstruct_data(
keyspace,
lsn_range.clone(),
reconstruct_state,
ctx,
)
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.await
}
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
let layer = layer_manager
.layer_map()
.get_in_memory_layer(handle)
.unwrap();
ReadableLayer::InMemoryLayer(layer) => {
layer
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.await
}
}

View File

@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::BytesMut;
use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
@@ -946,6 +947,34 @@ impl DeltaLayerInner {
Ok(planner.finish())
}
fn get_min_read_buffer_size(
planned_reads: &[VectoredRead],
read_size_soft_max: usize,
) -> usize {
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
return read_size_soft_max;
};
let largest_read_size = largest_read.size();
if largest_read_size > read_size_soft_max {
// If the read is oversized, it should only contain one key.
let offenders = largest_read
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
largest_read_size,
read_size_soft_max,
offenders
);
}
largest_read_size
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
@@ -959,7 +988,8 @@ impl DeltaLayerInner {
.expect("Layer is loaded with max vectored bytes config")
.0
.into();
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
let mut buf = Some(BytesMut::with_capacity(buf_size));
// Note that reads are processed in reverse order (from highest key+lsn).
// This is the order that `ReconstructState` requires such that it can
@@ -986,7 +1016,7 @@ impl DeltaLayerInner {
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
buf = Some(BytesMut::with_capacity(buf_size));
continue;
}
@@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
mod test {
use std::collections::BTreeMap;
use itertools::MinMaxResult;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use rand::RngCore;
use super::*;
use crate::{
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
context::DownloadBehavior,
task_mgr::TaskKind,
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
};
/// Construct an index for a fictional delta layer and and then
@@ -1332,4 +1369,229 @@ mod test {
assert_eq!(planned_blobs, expected_blobs);
}
mod constants {
use utils::lsn::Lsn;
/// Offset used by all lsns in this test
pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
/// Number of unique keys including in the test data
pub(super) const KEY_COUNT: u8 = 60;
/// Max number of different lsns for each key
pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
/// Possible value sizes for each key along with a probability weight
pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
/// Probability that there will be a gap between the current key and the next one (33.3%)
pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
/// The minimum size of a key range in all the generated reads
pub(super) const MIN_RANGE_SIZE: i128 = 10;
/// The number of ranges included in each vectored read
pub(super) const RANGES_COUNT: u8 = 2;
/// The number of vectored reads performed
pub(super) const READS_COUNT: u8 = 100;
/// Soft max size of a vectored read. Will be violated if we have to read keys
/// with values larger than the limit
pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
}
struct Entry {
key: Key,
lsn: Lsn,
value: Vec<u8>,
}
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
let mut current_key = Key::MIN;
let mut entries = Vec::new();
for _ in 0..constants::KEY_COUNT {
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
let mut lsns_iter =
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
Some(Lsn(lsn.0 + 0x08))
});
let mut lsns = Vec::new();
while lsns.len() < count as usize {
let take = rng.gen_bool(0.5);
let lsn = lsns_iter.next().unwrap();
if take {
lsns.push(lsn);
}
}
for lsn in lsns {
let size = constants::VALUE_SIZES
.choose_weighted(rng, |item| item.1)
.unwrap()
.0;
let mut buf = vec![0; size];
rng.fill_bytes(&mut buf);
entries.push(Entry {
key: current_key,
lsn,
value: buf,
})
}
let gap = constants::KEY_GAP_CHANGES
.choose_weighted(rng, |item| item.1)
.unwrap()
.0;
if gap {
current_key = current_key.add(2);
} else {
current_key = current_key.add(1);
}
}
entries
}
struct EntriesMeta {
key_range: Range<Key>,
lsn_range: Range<Lsn>,
index: BTreeMap<(Key, Lsn), Vec<u8>>,
}
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
_ => panic!("More than one entry is always expected"),
};
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
_ => panic!("More than one entry is always expected"),
};
let mut index = BTreeMap::new();
for entry in entries.iter() {
index.insert((entry.key, entry.lsn), entry.value.clone());
}
EntriesMeta {
key_range,
lsn_range,
index,
}
}
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
let start = key_range.start.to_i128();
let end = key_range.end.to_i128();
let mut keyspace = KeySpace::default();
for _ in 0..constants::RANGES_COUNT {
let mut range: Option<Range<Key>> = Option::default();
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
let range_start = rng.gen_range(start..end);
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
if range_end_offset >= end {
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
} else {
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
}
}
keyspace.ranges.push(range.unwrap());
}
keyspace
}
#[tokio::test]
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
let (tenant, ctx) = harness.load().await;
let timeline_id = TimelineId::generate();
let timeline = tenant
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
.await?;
tracing::info!("Generating test data ...");
let rng = &mut StdRng::seed_from_u64(0);
let entries = generate_entries(rng);
let entries_meta = get_entries_meta(&entries);
tracing::info!("Done generating {} entries", entries.len());
tracing::info!("Writing test data to delta layer ...");
let mut writer = DeltaLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
entries_meta.key_range.start,
entries_meta.lsn_range.clone(),
)
.await?;
for entry in entries {
let (_, res) = writer
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
.await;
res?;
}
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
let inner = resident.get_inner_delta(&ctx).await?;
let file_size = inner.file.metadata().await?.len();
tracing::info!(
"Done writing test data to delta layer. Resulting file size is: {}",
file_size
);
for i in 0..constants::READS_COUNT {
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
block_reader,
);
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
let mut reconstruct_state = ValuesReconstructState::new();
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
let vectored_reads = DeltaLayerInner::plan_reads(
keyspace.clone(),
entries_meta.lsn_range.clone(),
data_end_offset,
index_reader,
planner,
&mut reconstruct_state,
&ctx,
)
.await?;
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
&vectored_reads,
constants::MAX_VECTORED_READ_BYTES,
);
let mut buf = Some(BytesMut::with_capacity(buf_size));
for read in vectored_reads {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await?;
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
}
buf = Some(blobs_buf.buf);
}
}
Ok(())
}
}

View File

@@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
@@ -540,7 +541,25 @@ impl ImageLayerInner {
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
for read in reads.into_iter() {
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
let buf_size = read.size();
if buf_size > max_vectored_read_bytes {
// If the read is oversized, it should only contain one key.
let offenders = read
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
buf_size,
max_vectored_read_bytes,
offenders
);
}
let buf = BytesMut::with_capacity(buf_size);
let res = vectored_blob_reader.read_blobs(&read, buf).await;
match res {

View File

@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::walrecord;
use crate::{page_cache, walrecord};
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
@@ -23,8 +23,12 @@ use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
use std::cmp::Ordering;
use std::fmt::Write as _;
use std::ops::Range;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
@@ -32,10 +36,14 @@ use super::{
ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
file_id: InMemoryLayerFileId,
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive.
@@ -70,6 +78,8 @@ pub struct InMemoryLayerInner {
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
file: EphemeralFile,
resource_units: GlobalResourceUnits,
}
impl std::fmt::Debug for InMemoryLayerInner {
@@ -78,7 +88,126 @@ impl std::fmt::Debug for InMemoryLayerInner {
}
}
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
/// to minimize contention.
///
/// This global state is used to implement behaviors that require a global view of the system, e.g.
/// rolling layers proactively to limit the total amount of dirty data.
pub(crate) struct GlobalResources {
// Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
// Zero means unlimited.
pub(crate) max_dirty_bytes: AtomicU64,
// How many bytes are in all EphemeralFile objects
dirty_bytes: AtomicU64,
// How many layers are contributing to dirty_bytes
dirty_layers: AtomicUsize,
}
// Per-timeline RAII struct for its contribution to [`GlobalResources`]
struct GlobalResourceUnits {
// How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
// for decrementing the global counter by this many bytes when dropped.
dirty_bytes: u64,
}
impl GlobalResourceUnits {
// Hint for the layer append path to update us when the layer size differs from the last
// call to update_size by this much. If we don't reach this threshold, we'll still get
// updated when the Timeline "ticks" in the background.
const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
fn new() -> Self {
GLOBAL_RESOURCES
.dirty_layers
.fetch_add(1, AtomicOrdering::Relaxed);
Self { dirty_bytes: 0 }
}
/// Do not call this frequently: all timelines will write to these same global atomics,
/// so this is a relatively expensive operation. Wait at least a few seconds between calls.
///
/// Returns the effective layer size limit that should be applied, if any, to keep
/// the total number of dirty bytes below the configured maximum.
fn publish_size(&mut self, size: u64) -> Option<u64> {
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
Ordering::Greater => {
let delta = size - self.dirty_bytes;
let old = GLOBAL_RESOURCES
.dirty_bytes
.fetch_add(delta, AtomicOrdering::Relaxed);
old + delta
}
Ordering::Less => {
let delta = self.dirty_bytes - size;
let old = GLOBAL_RESOURCES
.dirty_bytes
.fetch_sub(delta, AtomicOrdering::Relaxed);
old - delta
}
};
// This is a sloppy update: concurrent updates to the counter will race, and the exact
// value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
// That's okay: as long as the metric contains some recent value, it doesn't have to always
// be literally the last update.
TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
self.dirty_bytes = size;
let max_dirty_bytes = GLOBAL_RESOURCES
.max_dirty_bytes
.load(AtomicOrdering::Relaxed);
if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
// Set the layer file limit to the average layer size: this implies that all above-average
// sized layers will be elegible for freezing. They will be frozen in the order they
// next enter publish_size.
Some(
new_global_dirty_bytes
/ GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
)
} else {
None
}
}
// Call publish_size if the input size differs from last published size by more than
// the drift limit
fn maybe_publish_size(&mut self, size: u64) {
let publish = match size.cmp(&self.dirty_bytes) {
Ordering::Equal => false,
Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
};
if publish {
self.publish_size(size);
}
}
}
impl Drop for GlobalResourceUnits {
fn drop(&mut self) {
GLOBAL_RESOURCES
.dirty_layers
.fetch_sub(1, AtomicOrdering::Relaxed);
// Subtract our contribution to the global total dirty bytes
self.publish_size(0);
}
}
pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
max_dirty_bytes: AtomicU64::new(0),
dirty_bytes: AtomicU64::new(0),
dirty_layers: AtomicUsize::new(0),
};
impl InMemoryLayer {
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
self.file_id
}
pub(crate) fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
@@ -93,6 +222,10 @@ impl InMemoryLayer {
}
}
pub(crate) fn try_len(&self) -> Option<u64> {
self.inner.try_read().map(|i| i.file.len()).ok()
}
pub(crate) fn assert_writable(&self) {
assert!(self.end_lsn.get().is_none());
}
@@ -318,8 +451,10 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.id());
Ok(InMemoryLayer {
file_id: key,
conf,
timeline_id,
tenant_shard_id,
@@ -328,6 +463,7 @@ impl InMemoryLayer {
inner: RwLock::new(InMemoryLayerInner {
index: HashMap::new(),
file,
resource_units: GlobalResourceUnits::new(),
}),
})
}
@@ -378,9 +514,18 @@ impl InMemoryLayer {
warn!("Key {} at {} already exists", key, lsn);
}
let size = locked_inner.file.len();
locked_inner.resource_units.maybe_publish_size(size);
Ok(())
}
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();
inner.resource_units.publish_size(size)
}
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())

View File

@@ -1759,6 +1759,18 @@ impl ResidentLayer {
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.owner.metadata()
}
#[cfg(test)]
pub(crate) async fn get_inner_delta<'a>(
&'a self,
ctx: &RequestContext,
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
LayerKind::Delta(d) => Ok(d),
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
}
}
}
impl AsLayerDesc for ResidentLayer {

View File

@@ -9,6 +9,7 @@ pub mod uninit;
mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use arc_swap::ArcSwap;
use bytes::Bytes;
use camino::Utf8Path;
use enumset::EnumSet;
@@ -19,7 +20,7 @@ use pageserver_api::{
keyspace::KeySpaceAccum,
models::{
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, LayerMapInfo, TimelineState,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, TenantShardId},
@@ -54,6 +55,7 @@ use std::{
ops::ControlFlow,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -64,7 +66,6 @@ use crate::{
disk_usage_eviction_task::DiskUsageEvictionInfo,
pgdatadir_mapping::CollectKeySpaceError,
};
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
@@ -118,11 +119,11 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::remote_timeline_client::RemoteTimelineClient;
use super::config::TenantConf;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState {
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
myself: Weak<Self>,
@@ -309,6 +310,8 @@ pub struct Timeline {
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
last_image_layer_creation_check_at: AtomicLsn,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: LogicalSize,
@@ -610,6 +613,25 @@ pub enum GetVectoredImpl {
Vectored,
}
pub(crate) enum WaitLsnWaiter<'a> {
Timeline(&'a Timeline),
Tenant,
PageService,
}
/// Argument to [`Timeline::shutdown`].
#[derive(Debug, Clone, Copy)]
pub(crate) enum ShutdownMode {
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -1058,7 +1080,8 @@ impl Timeline {
pub(crate) async fn wait_lsn(
&self,
lsn: Lsn,
_ctx: &RequestContext, /* Prepare for use by cancellation */
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
@@ -1066,20 +1089,28 @@ impl Timeline {
return Err(WaitLsnError::BadState);
}
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
"wait_lsn cannot be called in WAL receiver"
);
if cfg!(debug_assertions) {
match ctx.task_kind() {
TaskKind::WalReceiverManager
| TaskKind::WalReceiverConnectionHandler
| TaskKind::WalReceiverConnectionPoller => {
let is_myself = match who_is_waiting {
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
};
if is_myself {
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
}
} else {
// if another timeline's is waiting for us, there's no deadlock risk because
// our walreceiver task can make progress independent of theirs
}
}
_ => {}
}
}
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
@@ -1142,6 +1173,79 @@ impl Timeline {
self.flush_frozen_layers_and_wait().await
}
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
///
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
/// even if there are no ongoing writes to drive that.
async fn maybe_freeze_ephemeral_layer(&self) {
let Ok(_write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
return;
};
let Ok(layers_guard) = self.layers.try_read() else {
// Don't block if the layer lock is busy
return;
};
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
// No open layer, no work to do.
return;
};
let Some(current_size) = open_layer.try_len() else {
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
// read lock to get size should always succeed.
tracing::warn!("Lock conflict while reading size of open layer");
return;
};
let current_lsn = self.get_last_record_lsn();
let checkpoint_distance_override = open_layer.tick().await;
if let Some(size_override) = checkpoint_distance_override {
if current_size > size_override {
// This is not harmful, but it only happens in relatively rare cases where
// time-based checkpoints are not happening fast enough to keep the amount of
// ephemeral data within configured limits. It's a sign of stress on the system.
tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
}
}
let checkpoint_distance =
checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
if self.should_roll(
current_size,
current_size,
checkpoint_distance,
self.get_last_record_lsn(),
self.last_freeze_at.load(),
*self.last_freeze_ts.read().unwrap(),
) {
match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
// We may reach this point if the layer was already frozen by not yet flushed: flushing
// happens asynchronously in the background.
tracing::debug!(
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
);
}
InMemoryLayerInfo::Open { .. } => {
// Upgrade to a write lock and freeze the layer
drop(layers_guard);
let mut layers_guard = self.layers.write().await;
layers_guard
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
.await;
}
}
self.flush_frozen_layers();
}
}
/// Outermost timeline compaction operation; downloads needed layers.
pub(crate) async fn compact(
self: &Arc<Self>,
@@ -1164,6 +1268,11 @@ impl Timeline {
(guard, permit)
};
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle.
self.maybe_freeze_ephemeral_layer().await;
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
@@ -1196,6 +1305,7 @@ impl Timeline {
pub(crate) fn activate(
self: &Arc<Self>,
parent: Arc<crate::tenant::Tenant>,
broker_client: BrokerClientChannel,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
@@ -1206,95 +1316,122 @@ impl Timeline {
}
self.launch_wal_receiver(ctx, broker_client);
self.set_state(TimelineState::Active);
self.launch_eviction_task(background_jobs_can_start);
self.launch_eviction_task(parent, background_jobs_can_start);
}
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
/// After this function returns, there are no timeline-scoped tasks are left running.
///
/// While we are flushing, we continue to accept read I/O.
pub(crate) async fn flush_and_shutdown(&self) {
/// The preferred pattern for is:
/// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
/// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
/// go the extra mile and keep track of JoinHandles
/// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
/// instead of spawning directly on a runtime. It is a more composable / testable pattern.
///
/// For legacy reasons, we still have multiple tasks spawned using
/// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
/// We refer to these as "timeline-scoped task_mgr tasks".
/// Some of these tasks are already sensitive to Timeline::cancel while others are
/// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
/// or [`task_mgr::shutdown_watcher`].
/// We want to gradually convert the code base away from these.
///
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
/// ones that aren't mentioned here):
/// - [`TaskKind::TimelineDeletionWorker`]
/// - NB: also used for tenant deletion
/// - [`TaskKind::RemoteUploadTask`]`
/// - [`TaskKind::InitialLogicalSizeCalculation`]
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
/// - [`TaskKind::Eviction`]
/// - [`TaskKind::LayerFlushTask`]
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
// trying to flush
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
let try_freeze_and_flush = match mode {
ShutdownMode::FreezeAndFlush => true,
ShutdownMode::Hard => false,
};
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
// Regardless of whether we're going to try_freeze_and_flush
// or not, stop ingesting any more data. Walreceiver only provides
// cancellation but no "wait until gone", because it uses the Timeline::gate.
// So, only after the self.gate.close() below will we know for sure that
// no walreceiver tasks are left.
// For `try_freeze_and_flush=true`, this means that we might still be ingesting
// data during the call to `self.freeze_and_flush()` below.
// That's not ideal, but, we don't have the concept of a ChildGuard,
// which is what we'd need to properly model early shutdown of the walreceiver
// task sub-tree before the other Timeline task sub-trees.
let walreceiver = self.walreceiver.lock().unwrap().take();
tracing::debug!(
is_some = walreceiver.is_some(),
"Waiting for WalReceiverManager..."
);
if let Some(walreceiver) = walreceiver {
walreceiver.cancel();
}
// ... and inform any waiters for newer LSNs that there won't be any.
self.last_record_lsn.shutdown();
// now all writers to InMemory layer are gone, do the final flush if requested
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
if let Err(e) = client.shutdown().await {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to flush to remote storage: {e:#}");
if try_freeze_and_flush {
// we shut down walreceiver above, so, we won't add anything more
// to the InMemoryLayer; freeze it and wait for all frozen layers
// to reach the disk & upload queue, then shut the upload queue and
// wait for it to drain.
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
}
}
}
self.shutdown().await;
}
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
// while doing so.
self.last_record_lsn.shutdown();
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
// case our caller wants to use that for a deletion
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
if let Some(remote_client) = self.remote_client.as_ref() {
match remote_client.stop() {
Ok(()) => {}
Err(StopError::QueueUninitialized) => {
// Shutting down during initialization is legal
}
}
remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
}
// TODO: work toward making this a no-op. See this funciton's doc comment for more context.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
// Finally wait until any gate-holders are complete
// Finally wait until any gate-holders are complete.
//
// TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
// and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
self.gate.close().await;
self.metrics.shutdown();
@@ -1443,6 +1580,53 @@ impl Timeline {
Err(EvictionError::Timeout) => Ok(Some(false)),
}
}
fn should_roll(
&self,
layer_size: u64,
projected_layer_size: u64,
checkpoint_distance: u64,
projected_lsn: Lsn,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
) -> bool {
let distance = projected_lsn.widening_sub(last_freeze_at);
// Rolling the open layer can be triggered by:
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
// account for how writes are distributed across shards: we expect each node to consume
// 1/count of the LSN on average.
// 2. The size of the currently open layer.
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
// up and suspend activity.
if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
info!(
"Will roll layer at {} with layer size {} due to LSN distance ({})",
projected_lsn, layer_size, distance
);
true
} else if projected_layer_size >= checkpoint_distance {
info!(
"Will roll layer at {} with layer size {} due to layer size ({})",
projected_lsn, layer_size, projected_layer_size
);
true
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
info!(
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
projected_lsn,
layer_size,
last_freeze_ts.elapsed()
);
true
} else {
false
}
}
}
/// Number of times we will compute partition within a checkpoint distance.
@@ -1451,57 +1635,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.lazy_slru_download
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
}
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
let tenant_conf = &self.tenant_conf.load();
tenant_conf
.tenant_conf
.compaction_algorithm
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.eviction_policy
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
}
@@ -1515,14 +1707,26 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
pub(super) fn tenant_conf_updated(&self) {
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.image_layer_creation_check_threshold
.unwrap_or(
self.conf
.default_tenant_conf
.image_layer_creation_check_threshold,
)
}
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
// The threshold is embedded in the metric. So, we need to update it.
{
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
&self.tenant_conf.read().unwrap().tenant_conf,
new_conf,
&self.conf.default_tenant_conf,
);
@@ -1549,7 +1753,7 @@ impl Timeline {
#[allow(clippy::too_many_arguments)]
pub(super) fn new(
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
@@ -1568,14 +1772,13 @@ impl Timeline {
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let tenant_conf_guard = tenant_conf.read().unwrap();
let evictions_low_residence_duration_metric_threshold =
let evictions_low_residence_duration_metric_threshold = {
let loaded_tenant_conf = tenant_conf.load();
Self::get_evictions_low_residence_duration_metric_threshold(
&tenant_conf_guard.tenant_conf,
&loaded_tenant_conf.tenant_conf,
&conf.default_tenant_conf,
);
drop(tenant_conf_guard);
)
};
Arc::new_cyclic(|myself| {
let mut result = Timeline {
@@ -1652,6 +1855,7 @@ impl Timeline {
},
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
@@ -1680,6 +1884,7 @@ impl Timeline {
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result
.metrics
.last_record_gauge
@@ -1756,20 +1961,19 @@ impl Timeline {
self.timeline_id, self.tenant_shard_id
);
let tenant_conf_guard = self.tenant_conf.read().unwrap();
let wal_connect_timeout = tenant_conf_guard
let tenant_conf = self.tenant_conf.load();
let wal_connect_timeout = tenant_conf
.tenant_conf
.walreceiver_connect_timeout
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
let lagging_wal_timeout = tenant_conf_guard
let lagging_wal_timeout = tenant_conf
.tenant_conf
.lagging_wal_timeout
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
let max_lsn_wal_lag = tenant_conf_guard
let max_lsn_wal_lag = tenant_conf
.tenant_conf
.max_lsn_wal_lag
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
drop(tenant_conf_guard);
let mut guard = self.walreceiver.lock().unwrap();
assert!(
@@ -2317,10 +2521,6 @@ impl Timeline {
debug!("cancelling logical size calculation for timeline shutdown");
calculation.await
}
_ = task_mgr::shutdown_watcher() => {
debug!("cancelling logical size calculation for task shutdown");
calculation.await
}
}
}
@@ -2596,6 +2796,10 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
let open_layer = open_layer.clone();
drop(guard);
result = match open_layer
.get_value_reconstruct_data(
key,
@@ -2613,10 +2817,7 @@ impl Timeline {
traversal_path.push((
result,
cont_lsn,
Box::new({
let open_layer = Arc::clone(open_layer);
move || open_layer.traversal_id()
}),
Box::new(move || open_layer.traversal_id()),
));
continue 'outer;
}
@@ -2626,6 +2827,10 @@ impl Timeline {
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
let frozen_layer = frozen_layer.clone();
drop(guard);
result = match frozen_layer
.get_value_reconstruct_data(
key,
@@ -2643,10 +2848,7 @@ impl Timeline {
traversal_path.push((
result,
cont_lsn,
Box::new({
let frozen_layer = Arc::clone(frozen_layer);
move || frozen_layer.traversal_id()
}),
Box::new(move || frozen_layer.traversal_id()),
));
continue 'outer;
}
@@ -2654,6 +2856,8 @@ impl Timeline {
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = guard.get_from_desc(&layer);
drop(guard);
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -2771,16 +2975,6 @@ impl Timeline {
let mut completed_keyspace = KeySpace::default();
// Hold the layer map whilst visiting the timeline to prevent
// compaction, eviction and flushes from rendering the layers unreadable.
//
// TODO: Do we actually need to do this? In theory holding on
// to [`tenant::storage_layer::Layer`] should be enough. However,
// [`Timeline::get`] also holds the lock during IO, so more investigation
// is needed.
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
loop {
if cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
@@ -2790,6 +2984,9 @@ impl Timeline {
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
completed_keyspace.merge(&keys_done_last_step);
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
@@ -2797,12 +2994,11 @@ impl Timeline {
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayerDesc::InMemory {
handle: l,
lsn_ceil: cont_lsn,
},
ReadableLayer::InMemoryLayer(l),
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
@@ -2814,30 +3010,43 @@ impl Timeline {
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayerDesc::Persistent {
desc: (*layer).clone(),
lsn_range: lsn_floor..cont_lsn,
},
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
}
}
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
// It's safe to drop the layer map lock after planning the next round of reads.
// The fringe keeps readable handles for the layers which are safe to read even
// if layers were compacted or flushed.
//
// The more interesting consideration is: "Why is the read algorithm still correct
// if the layer map changes while it is operating?". Doing a vectored read on a
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
// covered by the read. The layer map tells us how to move the lsn downwards for a
// range at *a particular point in time*. It is fine for the answer to be different
// at two different time points.
drop(guard);
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
let next_cont_lsn = lsn_range.start;
layer_to_read
.get_values_reconstruct_data(
&guard,
keyspace_to_read.clone(),
lsn_range,
reconstruct_state,
ctx,
)
.await?;
unmapped_keyspace = keyspace_to_read;
cont_lsn = layer_to_read.get_lsn_floor();
cont_lsn = next_cont_lsn;
} else {
break;
}
@@ -2915,7 +3124,7 @@ impl Timeline {
}
}
ancestor
.wait_lsn(self.ancestor_lsn, ctx)
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -2995,16 +3204,11 @@ impl Timeline {
loop {
tokio::select! {
_ = self.cancel.cancelled() => {
info!("shutting down layer flush task");
break;
},
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
info!("shutting down layer flush task due to Timeline::cancel");
break;
},
_ = layer_flush_start_rx.changed() => {}
}
trace!("waking up");
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
@@ -3380,6 +3584,24 @@ impl Timeline {
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let last = self.last_image_layer_creation_check_at.load();
if lsn != Lsn(0) {
let distance = lsn
.checked_sub(last)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting below if we've not ingested
// sufficient WAL since the last check.
if distance.0 < min_distance {
return false;
}
}
self.last_image_layer_creation_check_at.store(lsn);
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;
@@ -3721,6 +3943,24 @@ impl Timeline {
Ok(())
}
/// Schedules the uploads of the given image layers
fn upload_new_image_layers(
self: &Arc<Self>,
new_images: impl IntoIterator<Item = ResidentLayer>,
) -> anyhow::Result<()> {
let Some(remote_client) = &self.remote_client else {
return Ok(());
};
for layer in new_images {
remote_client.schedule_layer_file_upload(layer)?;
}
// should any new image layer been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
// size, which will fail some tests, but should not be an issue otherwise.
remote_client.schedule_index_upload_for_file_changes()?;
Ok(())
}
/// Update information about which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
@@ -4460,49 +4700,6 @@ impl<'a> TimelineWriter<'a> {
res
}
/// "Tick" the timeline writer: it will roll the open layer if required
/// and do nothing else.
pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
self.open_layer_if_present().await?;
let last_record_lsn = self.get_last_record_lsn();
let action = self.get_open_layer_action(last_record_lsn, 0);
if action == OpenLayerAction::Roll {
self.roll_layer(last_record_lsn).await?;
}
Ok(())
}
/// Populate the timeline writer state only if an in-memory layer
/// is already open.
async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
assert!(self.write_guard.is_none());
let open_layer = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
match layers.open_layer {
Some(ref open_layer) => open_layer.clone(),
None => {
return Ok(());
}
}
};
let initial_size = open_layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
self.write_guard.replace(TimelineWriterState::new(
open_layer,
initial_size,
last_freeze_at,
last_freeze_ts,
));
Ok(())
}
async fn handle_open_layer_action(
&mut self,
at: Lsn,
@@ -4574,43 +4771,14 @@ impl<'a> TimelineWriter<'a> {
return OpenLayerAction::None;
}
let distance = lsn.widening_sub(state.cached_last_freeze_at);
let proposed_open_layer_size = state.current_size + new_value_size;
// Rolling the open layer can be triggered by:
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
// account for how writes are distributed across shards: we expect each node to consume
// 1/count of the LSN on average.
// 2. The size of the currently open layer.
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
// up and suspend activity.
if distance
>= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
{
info!(
"Will roll layer at {} with layer size {} due to LSN distance ({})",
lsn, state.current_size, distance
);
OpenLayerAction::Roll
} else if proposed_open_layer_size >= self.get_checkpoint_distance() {
info!(
"Will roll layer at {} with layer size {} due to layer size ({})",
lsn, state.current_size, proposed_open_layer_size
);
OpenLayerAction::Roll
} else if distance > 0
&& state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
{
info!(
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
lsn,
state.current_size,
state.cached_last_freeze_ts.elapsed()
);
if self.tl.should_roll(
state.current_size,
state.current_size + new_value_size,
self.get_checkpoint_distance(),
lsn,
state.cached_last_freeze_at,
state.cached_last_freeze_ts,
) {
OpenLayerAction::Roll
} else {
OpenLayerAction::None

View File

@@ -125,18 +125,8 @@ impl Timeline {
)
.await
.map_err(anyhow::Error::from)?;
if let Some(remote_client) = &self.remote_client {
for layer in layers {
remote_client.schedule_layer_file_upload(layer)?;
}
}
if let Some(remote_client) = &self.remote_client {
// should any new image layer been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
// size, which will fail some tests, but should not be an issue otherwise.
remote_client.schedule_index_upload_for_file_changes()?;
}
self.upload_new_image_layers(layers)?;
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -818,7 +808,10 @@ impl TimelineAdaptor {
self.timeline
.finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
.await?;
self.new_images.clear();
self.timeline
.upload_new_image_layers(std::mem::take(&mut self.new_images))?;
self.new_deltas.clear();
self.layers_to_delete.clear();
Ok(())

View File

@@ -6,7 +6,7 @@ use std::{
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, Instrument};
use tracing::{error, info, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId};
use crate::{
@@ -14,81 +14,14 @@ use crate::{
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
debug_assert_current_span_has_tenant_and_timeline_id,
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
};
use super::{Timeline, TimelineResources};
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// Notify any timeline work to drop out of loops/requests
tracing::debug!("Cancelling CancellationToken");
timeline.cancel.cancel();
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
)
.await;
// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}
// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(
None,
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
)
.await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
tracing::debug!("Waiting for gate...");
timeline.gate.close().await;
tracing::debug!("Shutdown complete");
Ok(())
}
/// Mark timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
@@ -282,7 +215,14 @@ impl DeleteTimelineFlow {
guard.mark_in_progress()?;
stop_tasks(&timeline).await?;
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
set_deleted_in_remote_index(&timeline).await?;

View File

@@ -51,6 +51,7 @@ pub struct EvictionTaskTenantState {
impl Timeline {
pub(super) fn launch_eviction_task(
self: &Arc<Self>,
parent: Arc<Tenant>,
background_tasks_can_start: Option<&completion::Barrier>,
) {
let self_clone = Arc::clone(self);
@@ -66,20 +67,19 @@ impl Timeline {
),
false,
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); }
_ = self_clone.cancel.cancelled() => { return Ok(()); }
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
};
self_clone.eviction_task(cancel).await;
self_clone.eviction_task(parent).await;
Ok(())
},
);
}
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
use crate::tenant::tasks::random_init_delay;
// acquire the gate guard only once within a useful span
@@ -94,7 +94,7 @@ impl Timeline {
EvictionPolicy::OnlyImitiate(lat) => lat.period,
EvictionPolicy::NoEviction => Duration::from_secs(10),
};
if random_init_delay(period, &cancel).await.is_err() {
if random_init_delay(period, &self.cancel).await.is_err() {
return;
}
}
@@ -103,13 +103,13 @@ impl Timeline {
loop {
let policy = self.get_eviction_policy();
let cf = self
.eviction_iteration(&policy, &cancel, &guard, &ctx)
.eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
.await;
match cf {
ControlFlow::Break(()) => break,
ControlFlow::Continue(sleep_until) => {
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
.await
.is_ok()
{
@@ -123,6 +123,7 @@ impl Timeline {
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
async fn eviction_iteration(
self: &Arc<Self>,
tenant: &Tenant,
policy: &EvictionPolicy,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -137,7 +138,7 @@ impl Timeline {
}
EvictionPolicy::LayerAccessThreshold(p) => {
match self
.eviction_iteration_threshold(p, cancel, gate, ctx)
.eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
.await
{
ControlFlow::Break(()) => return ControlFlow::Break(()),
@@ -146,7 +147,11 @@ impl Timeline {
(p.period, p.threshold)
}
EvictionPolicy::OnlyImitiate(p) => {
if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
if self
.imitiate_only(tenant, p, cancel, gate, ctx)
.await
.is_break()
{
return ControlFlow::Break(());
}
(p.period, p.threshold)
@@ -175,6 +180,7 @@ impl Timeline {
async fn eviction_iteration_threshold(
self: &Arc<Self>,
tenant: &Tenant,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -193,7 +199,10 @@ impl Timeline {
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
match self
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
{
ControlFlow::Break(()) => return ControlFlow::Break(()),
ControlFlow::Continue(()) => (),
}
@@ -315,6 +324,7 @@ impl Timeline {
/// disk usage based eviction task.
async fn imitiate_only(
self: &Arc<Self>,
tenant: &Tenant,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -331,7 +341,8 @@ impl Timeline {
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
self.imitate_layer_accesses(p, cancel, gate, ctx).await
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
}
/// If we evict layers but keep cached values derived from those layers, then
@@ -361,6 +372,7 @@ impl Timeline {
#[instrument(skip_all)]
async fn imitate_layer_accesses(
&self,
tenant: &Tenant,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -396,17 +408,11 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());
}
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {
self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
.await;
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
}
@@ -480,7 +486,7 @@ impl Timeline {
#[instrument(skip_all)]
async fn imitate_synthetic_size_calculation_worker(
&self,
tenant: &Arc<Tenant>,
tenant: &Tenant,
cancel: &CancellationToken,
ctx: &RequestContext,
) {

View File

@@ -86,6 +86,7 @@ impl<'t> UninitializedTimeline<'t> {
/// Prepares timeline data by loading it from the basebackup archive.
pub(crate) async fn import_basebackup_from_tar(
self,
tenant: Arc<Tenant>,
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
broker_client: storage_broker::BrokerClientChannel,
@@ -114,7 +115,7 @@ impl<'t> UninitializedTimeline<'t> {
// All the data has been imported. Insert the Timeline into the tenant's timelines map
let tl = self.finish_creation()?;
tl.activate(broker_client, None, ctx);
tl.activate(tenant, broker_client, None, ctx);
Ok(tl)
}

View File

@@ -24,26 +24,21 @@ mod connection_manager;
mod walreceiver_connection;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
};
use pageserver_api::shard::TenantShardId;
use std::future::Future;
use std::num::NonZeroU64;
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::Duration;
use storage_broker::BrokerClientChannel;
use tokio::select;
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TimelineId;
use self::connection_manager::ConnectionManagerStatus;
use super::Timeline;
@@ -62,9 +57,10 @@ pub struct WalReceiverConf {
}
pub struct WalReceiver {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
/// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
cancel: CancellationToken,
}
impl WalReceiver {
@@ -78,65 +74,58 @@ impl WalReceiver {
let timeline_id = timeline.timeline_id;
let walreceiver_ctx =
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
let loop_status = Arc::new(std::sync::RwLock::new(None));
let manager_status = Arc::clone(&loop_status);
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverManager,
Some(timeline.tenant_shard_id),
Some(timeline_id),
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
false,
let cancel = timeline.cancel.child_token();
WALRECEIVER_RUNTIME.spawn({
let cancel = cancel.clone();
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
// acquire timeline gate so we know the task doesn't outlive the Timeline
let Ok(_guard) = timeline.gate.enter() else {
debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
return;
};
debug!("WAL receiver manager started, connecting to broker");
let mut connection_manager_state = ConnectionManagerState::new(
timeline,
conf,
cancel.clone(),
);
loop {
select! {
_ = task_mgr::shutdown_watcher() => {
trace!("WAL receiver shutdown requested, shutting down");
while !cancel.is_cancelled() {
let loop_step_result = connection_manager_loop_step(
&mut broker_client,
&mut connection_manager_state,
&walreceiver_ctx,
&cancel,
&loop_status,
).await;
match loop_step_result {
Ok(()) => continue,
Err(_cancelled) => {
trace!("Connection manager loop ended, shutting down");
break;
},
loop_step_result = connection_manager_loop_step(
&mut broker_client,
&mut connection_manager_state,
&walreceiver_ctx,
&loop_status,
) => match loop_step_result {
ControlFlow::Continue(()) => continue,
ControlFlow::Break(()) => {
trace!("Connection manager loop ended, shutting down");
break;
}
},
}
}
}
connection_manager_state.shutdown().await;
*loop_status.write().unwrap() = None;
Ok(())
debug!("task exits");
}
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
);
});
Self {
tenant_shard_id,
timeline_id,
manager_status,
cancel,
}
}
pub async fn stop(self) {
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
#[instrument(skip_all, level = tracing::Level::DEBUG)]
pub fn cancel(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("cancelling walreceiver tasks");
self.cancel.cancel();
}
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
@@ -170,14 +159,18 @@ enum TaskStateUpdate<E> {
impl<E: Clone> TaskHandle<E> {
/// Initializes the task, starting it immediately after the creation.
///
/// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
/// It being a child token enables us to provide a [`Self::shutdown`] method.
fn spawn<Fut>(
cancel_parent: &CancellationToken,
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
) -> Self
where
Fut: Future<Output = anyhow::Result<()>> + Send,
E: Send + Sync + 'static,
{
let cancellation = CancellationToken::new();
let cancellation = cancel_parent.child_token();
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
let cancellation_clone = cancellation.clone();
@@ -197,6 +190,9 @@ impl<E: Clone> TaskHandle<E> {
}
}
/// # Cancel-Safety
///
/// Cancellation-safe.
async fn next_task_event(&mut self) -> TaskEvent<E> {
match self.events_receiver.changed().await {
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),

View File

@@ -17,7 +17,7 @@ use crate::metrics::{
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::{shutdown_token, TaskKind};
use crate::task_mgr::TaskKind;
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio::select;
use tokio_util::sync::CancellationToken;
use tracing::*;
use postgres_connection::PgConnectionConfig;
@@ -45,27 +45,33 @@ use super::{
TaskEvent, TaskHandle,
};
pub(crate) struct Cancelled;
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
/// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
/// If storage broker subscription is cancelled, exits.
///
/// # Cancel-Safety
///
/// Not cancellation-safe. Use `cancel` token to request cancellation.
pub(super) async fn connection_manager_loop_step(
broker_client: &mut BrokerClientChannel,
connection_manager_state: &mut ConnectionManagerState,
ctx: &RequestContext,
cancel: &CancellationToken,
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
) -> ControlFlow<(), ()> {
match connection_manager_state
.timeline
.wait_to_become_active(ctx)
.await
{
) -> Result<(), Cancelled> {
match tokio::select! {
_ = cancel.cancelled() => { return Err(Cancelled); },
st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
} {
Ok(()) => {}
Err(new_state) => {
debug!(
?new_state,
"state changed, stopping wal connection manager loop"
);
return ControlFlow::Break(());
return Err(Cancelled);
}
}
@@ -86,7 +92,7 @@ pub(super) async fn connection_manager_loop_step(
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
debug!("Subscribed for broker timeline updates");
loop {
@@ -94,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(
// These things are happening concurrently:
//
// - cancellation request
// - keep receiving WAL on the current connection
// - if the shared state says we need to change connection, disconnect and return
// - this runs in a separate task and we receive updates via a watch channel
@@ -101,7 +108,11 @@ pub(super) async fn connection_manager_loop_step(
// - receive updates from broker
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
select! {
// NB: make sure each of the select expressions are cancellation-safe
// (no need for arms to be cancellation-safe).
tokio::select! {
_ = cancel.cancelled() => { return Err(Cancelled); }
Some(wal_connection_update) = async {
match connection_manager_state.wal_connection.as_mut() {
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
@@ -133,7 +144,7 @@ pub(super) async fn connection_manager_loop_step(
},
// Got a new update from the broker
broker_update = broker_subscription.message() => {
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
match broker_update {
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
Err(status) => {
@@ -147,16 +158,17 @@ pub(super) async fn connection_manager_loop_step(
warn!("broker subscription failed: {status}");
}
}
return ControlFlow::Continue(());
return Ok(());
}
Ok(None) => {
error!("broker subscription stream ended"); // can't happen
return ControlFlow::Continue(());
return Ok(());
}
}
},
new_event = async {
// Reminder: this match arm needs to be cancellation-safe.
loop {
if connection_manager_state.timeline.current_state() == TimelineState::Loading {
warn!("wal connection manager should only be launched after timeline has become active");
@@ -182,11 +194,11 @@ pub(super) async fn connection_manager_loop_step(
}
} => match new_event {
ControlFlow::Continue(()) => {
return ControlFlow::Continue(());
return Ok(());
}
ControlFlow::Break(()) => {
debug!("Timeline is no longer active, stopping wal connection manager loop");
return ControlFlow::Break(());
return Err(Cancelled);
}
},
@@ -218,16 +230,15 @@ pub(super) async fn connection_manager_loop_step(
async fn subscribe_for_timeline_updates(
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
) -> Streaming<SafekeeperTimelineInfo> {
cancel: &CancellationToken,
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
let mut attempt = 0;
let cancel = shutdown_token();
loop {
exponential_backoff(
attempt,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&cancel,
cancel,
)
.await;
attempt += 1;
@@ -241,9 +252,14 @@ async fn subscribe_for_timeline_updates(
subscription_key: Some(key),
};
match broker_client.subscribe_safekeeper_info(request).await {
match {
tokio::select! {
r = broker_client.subscribe_safekeeper_info(request) => { r }
_ = cancel.cancelled() => { return Err(Cancelled); }
}
} {
Ok(resp) => {
return resp.into_inner();
return Ok(resp.into_inner());
}
Err(e) => {
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
@@ -264,6 +280,8 @@ pub(super) struct ConnectionManagerState {
id: TenantTimelineId,
/// Use pageserver data about the timeline to filter out some of the safekeepers.
timeline: Arc<Timeline>,
/// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
cancel: CancellationToken,
conf: WalReceiverConf,
/// Current connection to safekeeper for WAL streaming.
wal_connection: Option<WalConnection>,
@@ -386,7 +404,11 @@ struct BrokerSkTimeline {
}
impl ConnectionManagerState {
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
pub(super) fn new(
timeline: Arc<Timeline>,
conf: WalReceiverConf,
cancel: CancellationToken,
) -> Self {
let id = TenantTimelineId {
tenant_id: timeline.tenant_shard_id.tenant_id,
timeline_id: timeline.timeline_id,
@@ -394,6 +416,7 @@ impl ConnectionManagerState {
Self {
id,
timeline,
cancel,
conf,
wal_connection: None,
wal_stream_candidates: HashMap::new(),
@@ -401,6 +424,22 @@ impl ConnectionManagerState {
}
}
fn spawn<Fut>(
&self,
task: impl FnOnce(
tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
CancellationToken,
) -> Fut
+ Send
+ 'static,
) -> TaskHandle<WalConnectionStatus>
where
Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
{
// TODO: get rid of TaskHandle
super::TaskHandle::spawn(&self.cancel, task)
}
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
WALRECEIVER_SWITCHES
@@ -419,7 +458,7 @@ impl ConnectionManagerState {
);
let span = info_span!("connection", %node_id);
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
let connection_handle = self.spawn(move |events_sender, cancellation| {
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -447,6 +486,12 @@ impl ConnectionManagerState {
info!("walreceiver connection handling ended: {e}");
Ok(())
}
WalReceiverError::ClosedGate => {
info!(
"walreceiver connection handling ended because of closed gate"
);
Ok(())
}
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
if cancellation.is_cancelled() {
@@ -486,6 +531,10 @@ impl ConnectionManagerState {
/// Drops the current connection (if any) and updates retry timeout for the next
/// connection attempt to the same safekeeper.
///
/// # Cancel-Safety
///
/// Not cancellation-safe.
async fn drop_old_connection(&mut self, needs_shutdown: bool) {
let wal_connection = match self.wal_connection.take() {
Some(wal_connection) => wal_connection,
@@ -493,7 +542,14 @@ impl ConnectionManagerState {
};
if needs_shutdown {
wal_connection.connection_task.shutdown().await;
wal_connection
.connection_task
.shutdown()
// This here is why this function isn't cancellation-safe.
// If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
// Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
// and thus be ineffective.
.await;
}
let retry = self
@@ -838,6 +894,9 @@ impl ConnectionManagerState {
}
}
/// # Cancel-Safety
///
/// Not cancellation-safe.
pub(super) async fn shutdown(mut self) {
if let Some(wal_connection) = self.wal_connection.take() {
wal_connection.connection_task.shutdown().await;
@@ -986,7 +1045,7 @@ mod tests {
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
connection_task: state.spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
@@ -1154,7 +1213,7 @@ mod tests {
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
connection_task: state.spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
@@ -1221,7 +1280,7 @@ mod tests {
sk_id: NodeId(1),
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
connection_task: state.spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
@@ -1285,7 +1344,7 @@ mod tests {
sk_id: NodeId(1),
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
connection_task: state.spawn(move |_, _| async move { Ok(()) }),
discovered_new_wal: Some(NewCommittedWAL {
discovered_at: time_over_threshold,
lsn: new_lsn,
@@ -1341,6 +1400,7 @@ mod tests {
timeline_id: TIMELINE_ID,
},
timeline,
cancel: CancellationToken::new(),
conf: WalReceiverConf {
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
@@ -1384,7 +1444,7 @@ mod tests {
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
connection_task: state.spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();

View File

@@ -27,7 +27,6 @@ use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -37,8 +36,8 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::pageserver_feedback::PageserverFeedback;
use utils::{id::NodeId, lsn::Lsn};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
/// Status of the connection.
#[derive(Debug, Clone, Copy)]
@@ -68,6 +67,7 @@ pub(super) enum WalReceiverError {
SuccessfulCompletion(String),
/// Generic error
Other(anyhow::Error),
ClosedGate,
}
impl From<tokio_postgres::Error> for WalReceiverError {
@@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection(
) -> Result<(), WalReceiverError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// prevent timeline shutdown from finishing until we have exited
let _guard = timeline.gate.enter().map_err(|e| match e {
GateError::GateClosed => WalReceiverError::ClosedGate,
})?;
// This function spawns a side-car task (WalReceiverConnectionPoller).
// Get its gate guard now as well.
let poller_guard = timeline.gate.enter().map_err(|e| match e {
GateError::GateClosed => WalReceiverError::ClosedGate,
})?;
WALRECEIVER_STARTED_CONNECTIONS.inc();
// Connect to the database in replication mode.
@@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection(
}
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
// so spawn it off to run on its own. It shouldn't outlive this function, but,
// due to lack of async drop, we can't enforce that. However, we ensure that
// 1. it is sensitive to `cancellation` and
// 2. holds the Timeline gate open so that after timeline shutdown,
// we know this task is gone.
let _connection_ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionPoller,
ctx.download_behavior(),
);
let connection_cancellation = cancellation.clone();
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverConnectionPoller,
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
"walreceiver connection",
false,
WALRECEIVER_RUNTIME.spawn(
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
select! {
connection_result = connection => match connection_result {
Ok(()) => debug!("Walreceiver db connection closed"),
@@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection(
// with a similar error.
},
WalReceiverError::SuccessfulCompletion(_) => {}
WalReceiverError::ClosedGate => {
// doesn't happen at runtime
}
WalReceiverError::Other(err) => {
warn!("Connection aborted: {err:#}")
}
@@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection(
},
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
}
Ok(())
drop(poller_guard);
}
// Enrich the log lines emitted by this closure with meaningful context.
// TODO: technically, this task outlives the surrounding function, so, the
@@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection(
trace!("received XLogData between {startlsn} and {endlsn}");
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
waldecoder.feed_bytes(data);
{
@@ -389,17 +400,6 @@ pub(super) async fn handle_walreceiver_connection(
}
}
{
// This is a hack. It piggybacks on the keepalive messages sent by the
// safekeeper in order to enforce `checkpoint_timeout` on the currently
// open layer. This hack doesn't provide a bound on the total size of
// in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
let mut writer = timeline.writer().await;
if let Err(err) = writer.tick().await {
warn!("Timeline writer tick failed: {err}");
}
}
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn = timeline
.get_remote_consistent_lsn_visible()

View File

@@ -121,11 +121,16 @@ pub(super) enum SetDeletedFlagProgress {
Successful(NaiveDateTime),
}
pub(super) struct UploadQueueStopped {
pub(super) struct UploadQueueStoppedDeletable {
pub(super) upload_queue_for_deletion: UploadQueueInitialized,
pub(super) deleted_at: SetDeletedFlagProgress,
}
pub(super) enum UploadQueueStopped {
Deletable(UploadQueueStoppedDeletable),
Uninitialized,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum NotInitialized {
#[error("queue is in state Uninitialized")]
@@ -249,12 +254,15 @@ impl UploadQueue {
}
}
pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> {
match self {
UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Stopped(stopped) => Ok(stopped),
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => {
anyhow::bail!("queue is in state Stopped(Uninitialized)")
}
UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable),
}
}
}

View File

@@ -61,7 +61,7 @@ pub struct VectoredRead {
}
impl VectoredRead {
fn size(&self) -> usize {
pub fn size(&self) -> usize {
(self.end - self.start) as usize
}
}

View File

@@ -782,7 +782,7 @@ where
}
}
// NB: don't use `buf.is_empty()` here; it is from the
// `impl Deref for Slice { Target = [u8] }`; the the &[u8]
// `impl Deref for Slice { Target = [u8] }`; the &[u8]
// returned by it only covers the initialized portion of `buf`.
// Whereas we're interested in ensuring that we filled the entire
// buffer that the user passed in.