neon/control_plane/attachment_service/src/service.rs
John Spray 3bd2a4fd56 control_plane: avoid feedback loop with /location_config if compute hook fails. (#6668)
## Problem

The existing behavior isn't exactly incorrect, but is operationally
risky: if the control plane compute hook breaks, then all the control
plane operations trying to call /location_config will end up retrying
forever, which could put more load on the system.

## Summary of changes

- Treat 404s as fatal errors so that we do fewer retries: a 404 indicates either
that we have the wrong URL, or that some control plane bug is failing to recognize
our tenant ID as existing (see the sketch below).
- Do not return an error when reconciliation fails during a non-creating
/location_config request: this allows the control plane to finish its
Operation (we will eventually retry the compute notification later).
2024-02-07 19:14:18 +00:00
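
A rough sketch of the 404 classification described above. The `NotifyError` and `classify` names here are hypothetical illustrations, not the actual compute_hook types in this repository, and the real error handling may differ:

```rust
// Hedged sketch: classify compute-hook HTTP statuses into "retry" vs "fatal",
// assuming a hypothetical NotifyError type rather than the real compute_hook API.
use hyper::StatusCode;

#[derive(Debug)]
enum NotifyError {
    /// Transient conditions (rate limiting, server errors): retrying may succeed.
    Retry(StatusCode),
    /// Permanent conditions: retrying would only add load, so give up.
    Fatal(StatusCode),
}

fn classify(status: StatusCode) -> NotifyError {
    match status {
        // A 404 means either our notification URL is wrong, or the control
        // plane does not recognize the tenant ID: retrying will not help.
        StatusCode::NOT_FOUND => NotifyError::Fatal(status),
        // Rate limiting and 5xx responses are plausibly transient.
        s if s == StatusCode::TOO_MANY_REQUESTS || s.is_server_error() => {
            NotifyError::Retry(status)
        }
        // Anything else unexpected is treated as fatal to avoid a feedback loop.
        _ => NotifyError::Fatal(status),
    }
}
```

Treating 404 as fatal is what breaks the retry feedback loop described in the problem statement: a broken or misconfigured compute hook fails fast instead of keeping every /location_config operation spinning.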


use std::{
collections::{BTreeMap, HashMap},
str::FromStr,
sync::Arc,
time::{Duration, Instant},
};
use control_plane::attachment_service::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use diesel::result::DatabaseErrorKind;
use futures::StreamExt;
use hyper::StatusCode;
use pageserver_api::{
control_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
ValidateResponse, ValidateResponseTenant,
},
models,
models::{
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
TimelineCreateRequest, TimelineInfo,
},
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api;
use tokio_util::sync::CancellationToken;
use utils::{
completion::Barrier,
generation::Generation,
http::error::ApiError,
id::{NodeId, TenantId, TimelineId},
seqwait::SeqWait,
};
use crate::{
compute_hook::{self, ComputeHook},
node::Node,
persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
scheduler::Scheduler,
tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
ReconcilerWaiter, TenantState,
},
PlacementPolicy, Sequence,
};
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// Top level state available to all HTTP handlers
struct ServiceState {
tenants: BTreeMap<TenantShardId, TenantState>,
nodes: Arc<HashMap<NodeId, Node>>,
compute_hook: Arc<ComputeHook>,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
}
impl ServiceState {
fn new(
config: Config,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
nodes: HashMap<NodeId, Node>,
tenants: BTreeMap<TenantShardId, TenantState>,
) -> Self {
Self {
tenants,
nodes: Arc::new(nodes),
compute_hook: Arc::new(ComputeHook::new(config)),
result_tx,
}
}
}
#[derive(Clone)]
pub struct Config {
// All pageservers managed by one instance of this service must have
// the same public key. This JWT token will be used to authenticate
// this service to the pageservers it manages.
pub jwt_token: Option<String>,
// This JWT token will be used to authenticate this service to the control plane.
pub control_plane_jwt_token: Option<String>,
/// Where the compute hook should send notifications of pageserver attachment locations
/// (this URL points to the control plane in prod). If this is None, the compute hook will
/// assume it is running in a test environment and try to update neon_local.
pub compute_hook_url: Option<String>,
}
impl From<DatabaseError> for ApiError {
fn from(err: DatabaseError) -> ApiError {
match err {
DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
// FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
ApiError::ShuttingDown
}
DatabaseError::Logical(reason) => {
ApiError::InternalServerError(anyhow::anyhow!(reason))
}
}
}
}
pub struct Service {
inner: Arc<std::sync::RwLock<ServiceState>>,
config: Config,
persistence: Arc<Persistence>,
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
/// passes, it isn't safe to do any actions that mutate tenants.
pub(crate) startup_complete: Barrier,
}
impl From<ReconcileWaitError> for ApiError {
fn from(value: ReconcileWaitError) -> Self {
match value {
ReconcileWaitError::Shutdown => ApiError::ShuttingDown,
e @ ReconcileWaitError::Timeout(_) => ApiError::Timeout(format!("{e}").into()),
e @ ReconcileWaitError::Failed(..) => ApiError::InternalServerError(anyhow::anyhow!(e)),
}
}
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
}
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
/// until this is done.
async fn startup_reconcile(&self) {
// For each tenant shard, the observed state on the node that reported it (where None means
// indeterminate, same as in [`ObservedStateLocation`])
let mut observed = HashMap::new();
let nodes = {
let locked = self.inner.read().unwrap();
locked.nodes.clone()
};
// TODO: issue these requests concurrently
for node in nodes.values() {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
tracing::info!("Scanning shards on node {}...", node.id);
match client.list_location_config().await {
Err(e) => {
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
// TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
// pageserver is being restarted at the same time as we are
}
Ok(listing) => {
tracing::info!(
"Received {} shard statuses from pageserver {}, setting it to Active",
listing.tenant_shards.len(),
node.id
);
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
observed.insert(tenant_shard_id, (node.id, conf_opt));
}
}
}
}
let mut cleanup = Vec::new();
let mut compute_notifications = Vec::new();
// Populate intent and observed states for all tenants, based on reported state on pageservers
let shard_count = {
let mut locked = self.inner.write().unwrap();
for (tenant_shard_id, (node_id, observed_loc)) in observed {
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
}
// Populate each tenant's intent state
let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
tenant_state.intent_from_observed();
if let Err(e) = tenant_state.schedule(&mut scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
} else {
// If we're both intending and observed to be attached at a particular node, we will
// emit a compute notification for this. In the case where our observed state does not
// yet match our intent, we will eventually reconcile, and that will emit a compute notification.
if let Some(attached_at) = tenant_state.stably_attached() {
compute_notifications.push((*tenant_shard_id, attached_at));
}
}
}
locked.tenants.len()
};
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
// generation_pageserver in the database.
// Clean up any tenants that were found on pageservers but are not known to us.
for (tenant_shard_id, node_id) in cleanup {
// A node reported a tenant_shard_id which is unknown to us: detach it.
let node = nodes
.get(&node_id)
.expect("Always exists: only known nodes are scanned");
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
match client
.location_config(
tenant_shard_id,
LocationConfig {
mode: LocationConfigMode::Detached,
generation: None,
secondary_conf: None,
shard_number: tenant_shard_id.shard_number.0,
shard_count: tenant_shard_id.shard_count.0,
shard_stripe_size: 0,
tenant_conf: models::TenantConfig::default(),
},
None,
)
.await
{
Ok(()) => {
tracing::info!(
"Detached unknown shard {tenant_shard_id} on pageserver {node_id}"
);
}
Err(e) => {
// Non-fatal error: leaving a tenant shard behind that we are not managing shouldn't
// break anything.
tracing::error!(
"Failed to detach unknkown shard {tenant_shard_id} on pageserver {node_id}: {e}"
);
}
}
}
// Emit compute hook notifications for all tenants which are already stably attached. Other tenants
// will emit compute hook notifications when they reconcile.
//
// Ordering: we must complete these notification attempts before doing any other reconciliation for the
// tenants named here, because otherwise our calls to notify() might race with more recent values
// generated by reconciliation.
// Compute notify is fallible. If it fails here, do not delay overall startup: set the
// flag on these shards that they have a pending notification.
let compute_hook = self.inner.read().unwrap().compute_hook.clone();
// Construct an async stream of futures to invoke the compute notify function: we do this
// in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
let stream = futures::stream::iter(compute_notifications.into_iter())
.map(|(tenant_shard_id, node_id)| {
let compute_hook = compute_hook.clone();
async move {
// TODO: give Service a cancellation token for clean shutdown
let cancel = CancellationToken::new();
if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
tracing::error!(
tenant_shard_id=%tenant_shard_id,
node_id=%node_id,
"Failed to notify compute on startup for shard: {e}"
);
Some(tenant_shard_id)
} else {
None
}
}
})
.buffered(compute_hook::API_CONCURRENCY);
let notify_results = stream.collect::<Vec<_>>().await;
// Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
{
let mut locked = self.inner.write().unwrap();
for tenant_shard_id in notify_results.into_iter().flatten() {
if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
shard.pending_compute_notification = true;
}
}
}
// Finally, now that the service is up and running, launch reconcile operations for any tenants
// which require it: under normal circumstances this should only include tenants that were in some
// transient state before we restarted, or any tenants whose compute hooks failed above.
let reconcile_tasks = self.reconcile_all();
// We will not wait for these reconciliation tasks to run here: we're now done with startup and
// normal operations may proceed.
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
}
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
tracing::info!("Loading nodes from database...");
let nodes = persistence.list_nodes().await?;
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
tracing::info!("Loaded {} nodes from database.", nodes.len());
tracing::info!("Loading shards from database...");
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
tracing::info!(
"Loaded {} shards from database.",
tenant_shard_persistence.len()
);
let mut tenants = BTreeMap::new();
for tsp in tenant_shard_persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
let shard_identity = if tsp.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(tsp.shard_number as u8),
ShardCount(tsp.shard_count as u8),
ShardStripeSize(tsp.shard_stripe_size as u32),
)?
};
// We will populate intent properly later in [`Self::startup_reconcile`]; for now, populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if tsp.generation_pageserver != i64::MAX {
intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
};
tenants.insert(tenant_shard_id, new_tenant);
}
let (startup_completion, startup_complete) = utils::completion::channel();
let this = Arc::new(Self {
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
config.clone(),
result_tx,
nodes,
tenants,
))),
config,
persistence,
startup_complete,
});
let result_task_this = this.clone();
tokio::task::spawn(async move {
while let Some(result) = result_rx.recv().await {
tracing::info!(
"Reconcile result for sequence {}, ok={}",
result.sequence,
result.result.is_ok()
);
let mut locked = result_task_this.inner.write().unwrap();
let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
// A reconciliation result might race with removing a tenant: drop results for
// tenants that aren't in our map.
continue;
};
// Usually generation should only be updated via this path, so the max() isn't
// needed, but it is used to handle out-of-band updates via e.g. the test hook.
tenant.generation = std::cmp::max(tenant.generation, result.generation);
// If the reconciler signals that it failed to notify compute, set this state on
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
if let Some(conf) = &loc.conf {
tracing::info!(
"Updating observed location {}: {:?}",
node_id,
conf
);
} else {
tracing::info!("Setting observed location {} to None", node_id,)
}
}
tenant.observed = result.observed;
tenant.waiter.advance(result.sequence);
}
Err(e) => {
tracing::warn!(
"Reconcile error on tenant {}: {}",
tenant.tenant_shard_id,
e
);
// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
*(tenant.last_error.lock().unwrap()) = format!("{e}");
tenant.error_waiter.advance(result.sequence);
for (node_id, o) in result.observed.locations {
tenant.observed.locations.insert(node_id, o);
}
}
}
}
});
let startup_reconcile_this = this.clone();
tokio::task::spawn(async move {
// Block the [`Service::startup_complete`] barrier until we're done
let _completion = startup_completion;
startup_reconcile_this.startup_reconcile().await
});
Ok(this)
}
pub(crate) async fn attach_hook(
&self,
attach_req: AttachHookRequest,
) -> anyhow::Result<AttachHookResponse> {
// This is a test hook. To enable using it on tenants that were created directly with
// the pageserver API (not via this service), we will auto-create any missing tenant
// shards with default state.
let insert = {
let locked = self.inner.write().unwrap();
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
};
if insert {
let tsp = TenantShardPersistence {
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: 0,
generation: 0,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
};
match self.persistence.insert_tenant_shards(vec![tsp]).await {
Err(e) => match e {
DatabaseError::Query(diesel::result::Error::DatabaseError(
DatabaseErrorKind::UniqueViolation,
_,
)) => {
tracing::info!(
"Raced with another request to insert tenant {}",
attach_req.tenant_shard_id
)
}
_ => return Err(e.into()),
},
Ok(()) => {
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Single,
),
);
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
}
}
}
let new_generation = if let Some(req_node_id) = attach_req.node_id {
Some(
self.persistence
.increment_generation(attach_req.tenant_shard_id, req_node_id)
.await?,
)
} else {
self.persistence.detach(attach_req.tenant_shard_id).await?;
None
};
let mut locked = self.inner.write().unwrap();
let tenant_state = locked
.tenants
.get_mut(&attach_req.tenant_shard_id)
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_state.generation = new_generation;
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.
assert!(attach_req.node_id.is_none());
tenant_state.policy = PlacementPolicy::Detached;
}
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
ps_id = %attaching_pageserver,
generation = ?tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.intent.attached {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
%ps_id,
generation = ?tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
"no-op: tenant already has no pageserver");
}
tenant_state.intent.attached = attach_req.node_id;
tracing::info!(
"attach_hook: tenant {} set generation {:?}, pageserver {}",
attach_req.tenant_shard_id,
tenant_state.generation,
// TODO: this is an odd number of 0xf's
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_state.generation.into().unwrap()),
})
}
pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
let locked = self.inner.read().unwrap();
let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
InspectResponse {
attachment: tenant_state.and_then(|s| {
s.intent
.attached
.map(|ps| (s.generation.into().unwrap(), ps))
}),
}
}
pub(crate) async fn re_attach(
&self,
reattach_req: ReAttachRequest,
) -> anyhow::Result<ReAttachResponse> {
// Ordering: we must persist generation number updates before making them visible in the in-memory state
let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;
// Apply the updated generation to our in-memory state
let mut locked = self.inner.write().unwrap();
let mut response = ReAttachResponse {
tenants: Vec::new(),
};
for (tenant_shard_id, new_gen) in incremented_generations {
response.tenants.push(ReAttachResponseTenant {
id: tenant_shard_id,
gen: new_gen.into().unwrap(),
});
// Apply the new generation number to our in-memory state
let shard_state = locked.tenants.get_mut(&tenant_shard_id);
let Some(shard_state) = shard_state else {
// Not fatal. This edge case requires a re-attach to happen
// between inserting a new tenant shard into the database, and updating our in-memory
// state to know about the shard, _and_ that the state inserted to the database referenced
// a pageserver. Should never happen, but handle it rather than panicking, since it should
// be harmless.
tracing::error!(
"Shard {} is in database for node {} but not in-memory state",
tenant_shard_id,
reattach_req.node_id
);
continue;
};
shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
// to call location_conf API with an old generation. Wait for cancellation to complete
// before responding to this request. Requires well implemented CancellationToken logic
// all the way to where we call location_conf. Even then, there can still be a location_conf
// request in flight over the network: TODO handle that by making location_conf API refuse
// to go backward in generations.
}
Ok(response)
}
pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse {
let locked = self.inner.read().unwrap();
let mut response = ValidateResponse {
tenants: Vec::new(),
};
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
req_tenant.gen,
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid,
});
} else {
// After tenant deletion, we may approve any validation. This avoids
// spurious warnings on the pageserver if it has pending LSN updates
// at the point a deletion happens.
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid: true,
});
}
}
response
}
pub(crate) async fn tenant_create(
&self,
create_req: TenantCreateRequest,
) -> Result<TenantCreateResponse, ApiError> {
// Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded")
let literal_shard_count = if create_req.shard_parameters.is_unsharded() {
1
} else {
create_req.shard_parameters.count.0
};
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Attempted to create a specific shard, this API is for creating the whole tenant"
)));
} else {
create_req.new_tenant_id.tenant_id
};
tracing::info!(
"Creating tenant {}, shard_count={:?}",
create_req.new_tenant_id,
create_req.shard_parameters.count,
);
let create_ids = (0..literal_shard_count)
.map(|i| TenantShardId {
tenant_id,
shard_number: ShardNumber(i),
shard_count: create_req.shard_parameters.count,
})
.collect::<Vec<_>>();
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy: PlacementPolicy = PlacementPolicy::Single;
// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
// during the creation, rather than risking leaving orphan objects in S3.
let persist_tenant_shards = create_ids
.iter()
.map(|tenant_shard_id| TenantShardPersistence {
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
})
.collect();
self.persistence
.insert_tenant_shards(persist_tenant_shards)
.await
.map_err(|e| {
// TODO: distinguish primary key constraint (idempotent, OK), from other errors
ApiError::InternalServerError(anyhow::anyhow!(e))
})?;
let (waiters, response_shards) = {
let mut locked = self.inner.write().unwrap();
let mut response_shards = Vec::new();
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
for tenant_shard_id in create_ids {
tracing::info!("Creating shard {tenant_shard_id}...");
use std::collections::btree_map::Entry;
match locked.tenants.entry(tenant_shard_id) {
Entry::Occupied(mut entry) => {
tracing::info!(
"Tenant shard {tenant_shard_id} already exists while creating"
);
// TODO: schedule() should take an anti-affinity expression that pushes
// attached and secondary locations (independently) away from those
// pageservers also holding a shard for this tenant.
entry.get_mut().schedule(&mut scheduler).map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.intent
.attached
.expect("We just set pageserver if it was None"),
generation: entry.get().generation.into().unwrap(),
});
continue;
}
Entry::Vacant(entry) => {
let mut state = TenantState::new(
tenant_shard_id,
ShardIdentity::from_params(
tenant_shard_id.shard_number,
&create_req.shard_parameters,
),
placement_policy.clone(),
);
if let Some(create_gen) = create_req.generation {
state.generation = Generation::new(create_gen);
}
state.config = create_req.config.clone();
state.schedule(&mut scheduler).map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.attached
.expect("We just set pageserver if it was None"),
generation: state.generation.into().unwrap(),
});
entry.insert(state)
}
};
}
// Take a snapshot of pageservers
let pageservers = locked.nodes.clone();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let waiters = locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
.filter_map(|(_shard_id, shard)| {
shard.maybe_reconcile(
result_tx.clone(),
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
)
})
.collect::<Vec<_>>();
(waiters, response_shards)
};
self.await_waiters(waiters).await?;
Ok(TenantCreateResponse {
shards: response_shards,
})
}
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
/// wait for reconciliation to complete before responding.
async fn await_waiters(
&self,
waiters: Vec<ReconcilerWaiter>,
) -> Result<(), ReconcileWaitError> {
let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
for waiter in waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
Ok(())
}
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> Result<TenantLocationConfigResponse, ApiError> {
if req.tenant_id.shard_count.0 > 1 {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"This API is for importing single-sharded or unsharded tenants"
)));
}
let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let pageservers = locked.nodes.clone();
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
{
// Saw an existing shard: this is not a creation
create = false;
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if pageservers.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
shard.schedule(&mut scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.attached {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id,
})
}
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
if let Some(create_req) = maybe_create {
let create_resp = self.tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
} else {
// This was an update, wait for reconciliation
if let Err(e) = self.await_waiters(waiters).await {
// Do not treat a reconcile error as fatal: we have already applied any requested
// Intent changes, and the reconcile can fail for external reasons like unavailable
// compute notification API. In these cases, it is important that we do not
// cause the cloud control plane to retry forever on this API.
tracing::warn!(
"Failed to reconcile after /location_config: {e}, returning success anyway"
);
}
}
Ok(result)
}
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
// TODO: refactor into helper
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
// TODO: error out if the tenant is not attached anywhere.
// Phase 1: delete on the pageservers
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
// surface immediately as an error to our caller.
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting shard {tenant_shard_id} on node {}: {e}",
node.id
))
})?;
tracing::info!(
"Shard {tenant_shard_id} on node {}, delete returned {}",
node.id,
status
);
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
// Caller should call us again later. When we eventually see 404s from
// all the shards, we may proceed to delete our records of the tenant.
tracing::info!(
"Tenant {} has some shards pending deletion, returning 202",
tenant_id
);
return Ok(StatusCode::ACCEPTED);
}
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
// our in-memory state and database state.
// Ordering: we delete persistent state first: if we then
// crash, we will drop the in-memory state.
// Drop persistent state.
self.persistence.delete_tenant(tenant_id).await?;
// Drop in-memory state
{
let mut locked = self.inner.write().unwrap();
locked
.tenants
.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
tracing::info!(
"Deleted tenant {tenant_id}, now have {} tenants",
locked.tenants.len()
);
};
// Success is represented as 404, to imitate the existing pageserver deletion API
Ok(StatusCode::NOT_FOUND)
}
pub(crate) async fn tenant_timeline_create(
&self,
tenant_id: TenantId,
mut create_req: TimelineCreateRequest,
) -> Result<TimelineInfo, ApiError> {
let mut timeline_info = None;
tracing::info!(
"Creating timeline {}/{}",
tenant_id,
create_req.new_timeline_id,
);
self.ensure_attached_wait(tenant_id).await?;
// TODO: refuse to do this if shard splitting is in progress
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
}
for (tenant_shard_id, node) in targets {
// TODO: issue shard timeline creates in parallel, once the 0th is done.
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
tracing::info!(
"Creating timeline on shard {}/{}, attached to node {}",
tenant_shard_id,
create_req.new_timeline_id,
node.id
);
let shard_timeline_info = client
.timeline_create(tenant_shard_id, &create_req)
.await
.map_err(|e| match e {
mgmt_api::Error::ApiError(status, msg)
if status == StatusCode::INTERNAL_SERVER_ERROR
|| status == StatusCode::NOT_ACCEPTABLE =>
{
// TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper
// for pass-through API calls.
ApiError::InternalServerError(anyhow::anyhow!(msg))
}
_ => ApiError::Conflict(format!("Failed to create timeline: {e}")),
})?;
if timeline_info.is_none() {
// If the caller specified an ancestor but no ancestor LSN, we are responsible for
// propagating the LSN chosen by the first shard to the other shards: it is important
// that all shards end up with the same ancestor_start_lsn.
if create_req.ancestor_timeline_id.is_some()
&& create_req.ancestor_start_lsn.is_none()
{
create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn;
}
// We will return the TimelineInfo from the first shard
timeline_info = Some(shard_timeline_info);
}
}
Ok(timeline_info.expect("targets cannot be empty"))
}
pub(crate) async fn tenant_timeline_delete(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<StatusCode, ApiError> {
tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
self.ensure_attached_wait(tenant_id).await?;
// TODO: refuse to do this if shard splitting is in progress
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
}
// TODO: call into shards concurrently
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
tracing::info!(
"Deleting timeline on shard {}/{}, attached to node {}",
tenant_shard_id,
timeline_id,
node.id
);
let status = client
.timeline_delete(tenant_shard_id, timeline_id)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
node.id
))
})?;
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
Ok(StatusCode::ACCEPTED)
} else {
Ok(StatusCode::NOT_FOUND)
}
}
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
pub(crate) fn tenant_shard0_baseurl(
&self,
tenant_id: TenantId,
) -> Result<(String, TenantShardId), ApiError> {
let locked = self.inner.read().unwrap();
let Some((tenant_shard_id, shard)) = locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
// TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
// point to somewhere we haven't attached yet.
let Some(node_id) = shard.intent.attached else {
return Err(ApiError::Conflict(
"Cannot call timeline API on non-attached tenant".to_string(),
));
};
let Some(node) = locked.nodes.get(&node_id) else {
// This should never happen
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Shard refers to nonexistent node"
)));
};
Ok((node.base_url(), *tenant_shard_id))
}
pub(crate) fn tenant_locate(
&self,
tenant_id: TenantId,
) -> Result<TenantLocateResponse, ApiError> {
let locked = self.inner.read().unwrap();
tracing::info!("Locating shards for tenant {tenant_id}");
// Take a snapshot of pageservers
let pageservers = locked.nodes.clone();
let mut result = Vec::new();
let mut shard_params: Option<ShardParameters> = None;
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard
.intent
.attached
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
"Cannot locate a tenant that is not attached"
)))?;
let node = pageservers
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
result.push(TenantLocateResponseShard {
shard_id: *tenant_shard_id,
node_id,
listen_http_addr: node.listen_http_addr.clone(),
listen_http_port: node.listen_http_port,
listen_pg_addr: node.listen_pg_addr.clone(),
listen_pg_port: node.listen_pg_port,
});
match &shard_params {
None => {
shard_params = Some(ShardParameters {
stripe_size: shard.shard.stripe_size,
count: shard.shard.count,
});
}
Some(params) => {
if params.stripe_size != shard.shard.stripe_size {
// This should never happen. We enforce at runtime because it's simpler than
// adding an extra per-tenant data structure to store the things that should be the same
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Inconsistent shard stripe size parameters!"
)));
}
}
}
}
if result.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("No shards for this tenant ID found").into(),
));
}
let shard_params = shard_params.expect("result is non-empty, therefore this is set");
tracing::info!(
"Located tenant {} with params {:?} on shards {}",
tenant_id,
shard_params,
result
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
Ok(TenantLocateResponse {
shards: result,
shard_params,
})
}
pub(crate) async fn tenant_shard_migrate(
&self,
tenant_shard_id: TenantShardId,
migrate_req: TenantShardMigrateRequest,
) -> Result<TenantShardMigrateResponse, ApiError> {
let waiter = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let pageservers = locked.nodes.clone();
let compute_hook = locked.compute_hook.clone();
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
if shard.intent.attached == Some(migrate_req.node_id) {
// No-op case: we will still proceed to wait for reconciliation in case it is
// incomplete from an earlier update to the intent.
tracing::info!("Migrating: intent is unchanged {:?}", shard.intent);
} else {
let old_attached = shard.intent.attached;
match shard.policy {
PlacementPolicy::Single => {
shard.intent.secondary.clear();
}
PlacementPolicy::Double(_n) => {
// If our new attached node was a secondary, it no longer should be.
shard.intent.secondary.retain(|s| s != &migrate_req.node_id);
// If we were already attached to something, demote that to a secondary
if let Some(old_attached) = old_attached {
shard.intent.secondary.push(old_attached);
}
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
)))
}
}
shard.intent.attached = Some(migrate_req.node_id);
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
}
shard.maybe_reconcile(
result_tx,
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
)
};
if let Some(waiter) = waiter {
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
} else {
tracing::warn!("Migration is a no-op");
}
Ok(TenantShardMigrateResponse {})
}
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
// It is convenient to fetch from storage rather than reading in-memory state: this avoids
// taking the big lock and converting Node to a serializable structure.
let nodes = self
.persistence
.list_nodes()
.await?
.into_iter()
.map(|n| n.to_persistent())
.collect();
Ok(nodes)
}
pub(crate) async fn node_register(
&self,
register_req: NodeRegisterRequest,
) -> Result<(), ApiError> {
// Pre-check for an already-existing node
{
let locked = self.inner.read().unwrap();
if let Some(node) = locked.nodes.get(&register_req.node_id) {
// Note that we do not do a total equality of the struct, because we don't require
// the availability/scheduling states to agree for a POST to be idempotent.
if node.listen_http_addr == register_req.listen_http_addr
&& node.listen_http_port == register_req.listen_http_port
&& node.listen_pg_addr == register_req.listen_pg_addr
&& node.listen_pg_port == register_req.listen_pg_port
{
tracing::info!(
"Node {} re-registered with matching address",
register_req.node_id
);
return Ok(());
} else {
// TODO: decide if we want to allow modifying node addresses without removing and re-adding
// the node. Safest/simplest thing is to refuse it, and usually we deploy with
// a fixed address through the lifetime of a node.
tracing::warn!(
"Node {} tried to register with different address",
register_req.node_id
);
return Err(ApiError::Conflict(
"Node is already registered with different address".to_string(),
));
}
}
}
// Ordering: we must persist the new node _before_ adding it to in-memory state.
// This ensures that before we use it for anything or expose it via any external
// API, it is guaranteed to be available after a restart.
let new_node = Node {
id: register_req.node_id,
listen_http_addr: register_req.listen_http_addr,
listen_http_port: register_req.listen_http_port,
listen_pg_addr: register_req.listen_pg_addr,
listen_pg_port: register_req.listen_pg_port,
scheduling: NodeSchedulingPolicy::Filling,
// TODO: we shouldn't really call this Active until we've heartbeated it.
availability: NodeAvailability::Active,
};
// TODO: idempotency if the node already exists in the database
self.persistence.insert_node(&new_node).await?;
let mut locked = self.inner.write().unwrap();
let mut new_nodes = (*locked.nodes).clone();
new_nodes.insert(register_req.node_id, new_node);
locked.nodes = Arc::new(new_nodes);
tracing::info!(
"Registered pageserver {}, now have {} pageservers",
register_req.node_id,
locked.nodes.len()
);
Ok(())
}
pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let mut new_nodes = (*locked.nodes).clone();
let Some(node) = new_nodes.get_mut(&config_req.node_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Node not registered").into(),
));
};
let mut offline_transition = false;
let mut active_transition = false;
if let Some(availability) = &config_req.availability {
match (availability, &node.availability) {
(NodeAvailability::Offline, NodeAvailability::Active) => {
tracing::info!("Node {} transition to offline", config_req.node_id);
offline_transition = true;
}
(NodeAvailability::Active, NodeAvailability::Offline) => {
tracing::info!("Node {} transition to active", config_req.node_id);
active_transition = true;
}
_ => {
tracing::info!("Node {} no change during config", config_req.node_id);
// No change
}
};
node.availability = *availability;
}
if let Some(scheduling) = config_req.scheduling {
node.scheduling = scheduling;
// TODO: once we have a background scheduling ticker for fill/drain, kick it
// to wake up and start working.
}
let new_nodes = Arc::new(new_nodes);
let mut scheduler = Scheduler::new(&locked.tenants, &new_nodes);
if offline_transition {
for (tenant_shard_id, tenant_state) in &mut locked.tenants {
if let Some(observed_loc) =
tenant_state.observed.locations.get_mut(&config_req.node_id)
{
// When a node goes offline, we set its observed configuration to None, indicating unknown: we will
// not assume our knowledge of the node's configuration is accurate until it comes back online
observed_loc.conf = None;
}
if tenant_state.intent.notify_offline(config_req.node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(&mut scheduler) {
Err(e) => {
// It is possible that some tenants will become unschedulable when too many pageservers
// go offline: in this case there isn't much we can do other than make the issue observable.
// TODO: give TenantState a scheduling error attribute to be queried later.
tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
}
Ok(()) => {
tenant_state.maybe_reconcile(
result_tx.clone(),
&new_nodes,
&compute_hook,
&self.config,
&self.persistence,
);
}
}
}
}
}
if active_transition {
// When a node comes back online, we must reconcile any tenant that has a None observed
// location on the node.
for tenant_state in locked.tenants.values_mut() {
if let Some(observed_loc) =
tenant_state.observed.locations.get_mut(&config_req.node_id)
{
if observed_loc.conf.is_none() {
tenant_state.maybe_reconcile(
result_tx.clone(),
&new_nodes,
&compute_hook,
&self.config,
&self.persistence,
);
}
}
}
// TODO: in the background, we should balance work back onto this pageserver
}
locked.nodes = new_nodes;
Ok(())
}
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
tenant_id: TenantId,
) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
let mut waiters = Vec::new();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
let pageservers = locked.nodes.clone();
for (_tenant_shard_id, shard) in locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
{
shard.schedule(&mut scheduler)?;
if let Some(waiter) = shard.maybe_reconcile(
result_tx.clone(),
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
) {
waiters.push(waiter);
}
}
Ok(waiters)
}
async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
let ensure_waiters = {
let locked = self.inner.write().unwrap();
self.ensure_attached_schedule(locked, tenant_id)
.map_err(ApiError::InternalServerError)?
};
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
for waiter in ensure_waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
Ok(())
}
/// Check all tenants for pending reconciliation work, and reconcile those in need
///
/// Returns how many reconciliation tasks were started
fn reconcile_all(&self) -> usize {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let pageservers = locked.nodes.clone();
locked
.tenants
.iter_mut()
.filter_map(|(_tenant_shard_id, shard)| {
shard.maybe_reconcile(
result_tx.clone(),
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
)
})
.count()
}
}