Merge remote-tracking branch 'origin/main' into communicator-rewrite

This commit is contained in:
Heikki Linnakangas
2025-07-05 16:59:51 +03:00
141 changed files with 5475 additions and 2033 deletions

View File

@@ -6,6 +6,11 @@ use utils::id::NodeId;
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;
#[derive(Copy, Clone)]
pub(crate) struct Delete {
pub(crate) node_id: NodeId,
}
#[derive(Copy, Clone)]
pub(crate) struct Drain {
pub(crate) node_id: NodeId,
@@ -18,6 +23,7 @@ pub(crate) struct Fill {
#[derive(Copy, Clone)]
pub(crate) enum Operation {
Delete(Delete),
Drain(Drain),
Fill(Fill),
}
@@ -30,6 +36,8 @@ pub(crate) enum OperationError {
FinalizeError(Cow<'static, str>),
#[error("Operation cancelled")]
Cancelled,
#[error("Impossible constraint error: {0}")]
ImpossibleConstraint(Cow<'static, str>),
}
pub(crate) struct OperationHandler {
@@ -38,6 +46,12 @@ pub(crate) struct OperationHandler {
pub(crate) cancel: CancellationToken,
}
impl Display for Delete {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "delete {}", self.node_id)
}
}
impl Display for Drain {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "drain {}", self.node_id)
@@ -53,6 +67,7 @@ impl Display for Fill {
impl Display for Operation {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Operation::Delete(op) => write!(f, "{op}"),
Operation::Drain(op) => write!(f, "{op}"),
Operation::Fill(op) => write!(f, "{op}"),
}

View File

@@ -919,7 +919,7 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
}
async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn handle_node_delete_old(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let req = match maybe_forward(req).await {
@@ -931,7 +931,10 @@ async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiErr
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
json_response(
StatusCode::OK,
state.service.node_delete_old(node_id).await?,
)
}
async fn handle_tombstone_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1051,6 +1054,42 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
json_response(StatusCode::OK, leader)
}
async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(
StatusCode::OK,
state.service.start_node_delete(node_id).await?,
)
}
async fn handle_cancel_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Infra)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(
StatusCode::ACCEPTED,
state.service.cancel_node_delete(node_id).await?,
)
}
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Infra)?;
@@ -2221,8 +2260,14 @@ pub fn make_router(
.post("/control/v1/node", |r| {
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
})
// This endpoint is deprecated and will be removed in a future version.
// Use PUT /control/v1/node/:node_id/delete instead.
.delete("/control/v1/node/:node_id", |r| {
named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete"))
named_request_span(
r,
handle_node_delete_old,
RequestName("control_v1_node_delete"),
)
})
.get("/control/v1/node", |r| {
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
@@ -2247,6 +2292,20 @@ pub fn make_router(
.get("/control/v1/leader", |r| {
named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
})
.put("/control/v1/node/:node_id/delete", |r| {
named_request_span(
r,
handle_node_delete,
RequestName("control_v1_start_node_delete"),
)
})
.delete("/control/v1/node/:node_id/delete", |r| {
named_request_span(
r,
handle_cancel_node_delete,
RequestName("control_v1_cancel_node_delete"),
)
})
.put("/control/v1/node/:node_id/drain", |r| {
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
})

View File

@@ -6,13 +6,13 @@ extern crate hyper0 as hyper;
mod auth;
mod background_node_operations;
mod compute_hook;
mod drain_utils;
mod heartbeater;
pub mod http;
mod id_lock_map;
mod leadership;
pub mod metrics;
mod node;
mod operation_utils;
mod pageserver_client;
mod peer_client;
pub mod persistence;

View File

@@ -76,6 +76,9 @@ pub(crate) struct StorageControllerMetricGroup {
/// How many shards would like to reconcile but were blocked by concurrency limits
pub(crate) storage_controller_pending_reconciles: measured::Gauge,
/// How many shards are keep-failing and will be ignored when considering to run optimizations
pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge,
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,

View File

@@ -201,6 +201,7 @@ impl Node {
match self.scheduling {
NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
NodeSchedulingPolicy::Deleting => MaySchedule::No,
NodeSchedulingPolicy::Draining => MaySchedule::No,
NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
NodeSchedulingPolicy::Pause => MaySchedule::No,

View File

@@ -10,63 +10,19 @@ use crate::node::Node;
use crate::scheduler::Scheduler;
use crate::tenant_shard::TenantShard;
pub(crate) struct TenantShardIterator<F> {
tenants_accessor: F,
inspected_all_shards: bool,
last_inspected_shard: Option<TenantShardId>,
}
/// A simple iterator which can be used in tandem with [`crate::service::Service`]
/// to iterate over all known tenant shard ids without holding the lock on the
/// service state at all times.
impl<F> TenantShardIterator<F>
where
F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
{
pub(crate) fn new(tenants_accessor: F) -> Self {
Self {
tenants_accessor,
inspected_all_shards: false,
last_inspected_shard: None,
}
}
/// Returns the next tenant shard id if one exists
pub(crate) fn next(&mut self) -> Option<TenantShardId> {
if self.inspected_all_shards {
return None;
}
match (self.tenants_accessor)(self.last_inspected_shard) {
Some(tid) => {
self.last_inspected_shard = Some(tid);
Some(tid)
}
None => {
self.inspected_all_shards = true;
None
}
}
}
/// Returns true when the end of the iterator is reached and false otherwise
pub(crate) fn finished(&self) -> bool {
self.inspected_all_shards
}
}
/// Check that the state of the node being drained is as expected:
/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`]
/// node is present in memory and scheduling policy is set to expected_policy
pub(crate) fn validate_node_state(
node_id: &NodeId,
nodes: Arc<HashMap<NodeId, Node>>,
expected_policy: NodeSchedulingPolicy,
) -> Result<(), OperationError> {
let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged(
format!("node {node_id} was removed").into(),
))?;
let current_policy = node.get_scheduling();
if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
if current_policy != expected_policy {
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
// about it
return Err(OperationError::NodeStateChanged(
@@ -182,55 +138,3 @@ impl TenantShardDrain {
}
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use utils::id::TenantId;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
use super::TenantShardIterator;
#[test]
fn test_tenant_shard_iterator() {
let tenant_id = TenantId::generate();
let shard_count = ShardCount(8);
let mut tenant_shards = Vec::default();
for i in 0..shard_count.0 {
tenant_shards.push((
TenantShardId {
tenant_id,
shard_number: ShardNumber(i),
shard_count,
},
(),
))
}
let tenant_shards = Arc::new(tenant_shards);
let mut tid_iter = TenantShardIterator::new({
let tenants = tenant_shards.clone();
move |last_inspected_shard: Option<TenantShardId>| {
let entry = match last_inspected_shard {
Some(skip_past) => {
let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
cursor.nth(1)
}
None => tenants.first(),
};
entry.map(|(tid, _)| tid).copied()
}
});
let mut iterated_over = Vec::default();
while let Some(tid) = tid_iter.next() {
iterated_over.push((tid, ()));
}
assert_eq!(iterated_over, *tenant_shards);
}
}

View File

@@ -635,18 +635,23 @@ impl Persistence {
let updated = self
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
Box::pin(async move {
// Check if the node is not marked as deleted
let deleted_node: i64 = nodes
let node: Option<NodePersistence> = nodes
.filter(node_id.eq(input_node_id.0 as i64))
.filter(lifecycle.eq(String::from(NodeLifecycle::Deleted)))
.count()
.get_result(conn)
.await?;
if deleted_node > 0 {
return Err(DatabaseError::Logical(format!(
"Node {input_node_id} is marked as deleted, re-attach is not allowed"
)));
}
.first::<NodePersistence>(conn)
.await
.optional()?;
// Check if the node is not marked as deleted
match node {
Some(node) if matches!(NodeLifecycle::from_str(&node.lifecycle), Ok(NodeLifecycle::Deleted)) => {
return Err(DatabaseError::Logical(format!(
"Node {input_node_id} is marked as deleted, re-attach is not allowed"
)));
}
_ => {
// go through
}
};
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(input_node_id.0 as i64))
@@ -664,21 +669,23 @@ impl Persistence {
.load(conn)
.await?;
// If the node went through a drain and restart phase before re-attaching,
// then reset it's node scheduling policy to active.
diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.filter(
scheduling_policy
.eq(String::from(NodeSchedulingPolicy::PauseForRestart))
.or(scheduling_policy
.eq(String::from(NodeSchedulingPolicy::Draining)))
.or(scheduling_policy
.eq(String::from(NodeSchedulingPolicy::Filling))),
)
.set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
.execute(conn)
.await?;
if let Some(node) = node {
let old_scheduling_policy =
NodeSchedulingPolicy::from_str(&node.scheduling_policy).unwrap();
let new_scheduling_policy = match old_scheduling_policy {
NodeSchedulingPolicy::Active => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::PauseForRestart => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::Draining => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::Filling => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::Pause => NodeSchedulingPolicy::Pause,
NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause,
};
diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set(scheduling_policy.eq(String::from(new_scheduling_policy)))
.execute(conn)
.await?;
}
Ok(updated)
})
@@ -1388,6 +1395,48 @@ impl Persistence {
.await
}
/// Activate the given safekeeper, ensuring that there is no TOCTOU.
/// Returns `Some` if the safekeeper has indeed been activating (or already active). Other states return `None`.
pub(crate) async fn activate_safekeeper(&self, id_: i64) -> Result<Option<()>, DatabaseError> {
use crate::schema::safekeepers::dsl::*;
self.with_conn(move |conn| {
Box::pin(async move {
#[derive(Insertable, AsChangeset)]
#[diesel(table_name = crate::schema::safekeepers)]
struct UpdateSkSchedulingPolicy<'a> {
id: i64,
scheduling_policy: &'a str,
}
let scheduling_policy_active = String::from(SkSchedulingPolicy::Active);
let scheduling_policy_activating = String::from(SkSchedulingPolicy::Activating);
let rows_affected = diesel::update(
safekeepers.filter(id.eq(id_)).filter(
scheduling_policy
.eq(scheduling_policy_activating)
.or(scheduling_policy.eq(&scheduling_policy_active)),
),
)
.set(scheduling_policy.eq(&scheduling_policy_active))
.execute(conn)
.await?;
if rows_affected == 0 {
return Ok(Some(()));
}
if rows_affected != 1 {
return Err(DatabaseError::Logical(format!(
"unexpected number of rows ({rows_affected})",
)));
}
Ok(Some(()))
})
})
.await
}
/// Persist timeline. Returns if the timeline was newly inserted. If it wasn't, we haven't done any writes.
pub(crate) async fn insert_timeline(&self, entry: TimelinePersistence) -> DatabaseResult<bool> {
use crate::schema::timelines;

View File

@@ -1,8 +1,8 @@
pub mod chaos_injector;
mod context_iterator;
pub mod feature_flag;
pub(crate) mod safekeeper_reconciler;
mod safekeeper_service;
mod tenant_shard_iterator;
use std::borrow::Cow;
use std::cmp::Ordering;
@@ -16,7 +16,6 @@ use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant, SystemTime};
use anyhow::Context;
use context_iterator::TenantShardContextIterator;
use control_plane::storage_controller::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};
@@ -31,8 +30,8 @@ use pageserver_api::controller_api::{
AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use pageserver_api::models::{
@@ -55,6 +54,7 @@ use pageserver_client::{BlockUnblock, mgmt_api};
use reqwest::{Certificate, StatusCode};
use safekeeper_api::models::SafekeeperUtilization;
use safekeeper_reconciler::SafekeeperReconcilers;
use tenant_shard_iterator::{TenantShardExclusiveIterator, create_shared_shard_iterator};
use tokio::sync::TryAcquireError;
use tokio::sync::mpsc::error::TrySendError;
use tokio_util::sync::CancellationToken;
@@ -68,10 +68,9 @@ use utils::sync::gate::{Gate, GateGuard};
use utils::{failpoint_support, pausable_failpoint};
use crate::background_node_operations::{
Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler,
Delete, Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler,
};
use crate::compute_hook::{self, ComputeHook, NotifyError};
use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator};
use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState};
use crate::id_lock_map::{
IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock,
@@ -79,6 +78,7 @@ use crate::id_lock_map::{
use crate::leadership::Leadership;
use crate::metrics;
use crate::node::{AvailabilityTransition, Node};
use crate::operation_utils::{self, TenantShardDrain};
use crate::pageserver_client::PageserverClient;
use crate::peer_client::GlobalObservedState;
use crate::persistence::split_state::SplitState;
@@ -105,7 +105,7 @@ use crate::timeline_import::{
TimelineImportFinalizeError, TimelineImportState, UpcallClient,
};
const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
const WAITER_OPERATION_POLL_TIMEOUT: Duration = Duration::from_millis(500);
// For operations that should be quick, like attaching a new tenant
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -210,6 +210,10 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
// Number of consecutive reconciliation errors, occured for one shard,
// after which the shard is ignored when considering to run optimizations.
const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
// than they're being pushed onto the queue.
@@ -577,7 +581,9 @@ impl From<ReconcileWaitError> for ApiError {
impl From<OperationError> for ApiError {
fn from(value: OperationError) -> Self {
match value {
OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => {
OperationError::NodeStateChanged(err)
| OperationError::FinalizeError(err)
| OperationError::ImpossibleConstraint(err) => {
ApiError::InternalServerError(anyhow::anyhow!(err))
}
OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
@@ -702,6 +708,36 @@ struct ShardMutationLocations {
#[derive(Default, Clone)]
struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
struct ReconcileAllResult {
spawned_reconciles: usize,
keep_failing_reconciles: usize,
has_delayed_reconciles: bool,
}
impl ReconcileAllResult {
fn new(
spawned_reconciles: usize,
keep_failing_reconciles: usize,
has_delayed_reconciles: bool,
) -> Self {
assert!(
spawned_reconciles >= keep_failing_reconciles,
"It is impossible to have more keep-failing reconciles than spawned reconciles"
);
Self {
spawned_reconciles,
keep_failing_reconciles,
has_delayed_reconciles,
}
}
/// We can run optimizations only if we don't have any delayed reconciles and
/// all spawned reconciles are also keep-failing reconciles.
fn can_run_optimizations(&self) -> bool {
!self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
}
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -899,7 +935,7 @@ impl Service {
// which require it: under normal circumstances this should only include tenants that were in some
// transient state before we restarted, or any tenants whose compute hooks failed above.
tracing::info!("Checking for shards in need of reconciliation...");
let reconcile_tasks = self.reconcile_all();
let reconcile_all_result = self.reconcile_all();
// We will not wait for these reconciliation tasks to run here: we're now done with startup and
// normal operations may proceed.
@@ -947,8 +983,9 @@ impl Service {
}
}
let spawned_reconciles = reconcile_all_result.spawned_reconciles;
tracing::info!(
"Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"
"Startup complete, spawned {spawned_reconciles} reconciliation tasks ({shard_count} shards total)"
);
}
@@ -1199,8 +1236,8 @@ impl Service {
while !self.reconcilers_cancel.is_cancelled() {
tokio::select! {
_ = interval.tick() => {
let reconciles_spawned = self.reconcile_all();
if reconciles_spawned == 0 {
let reconcile_all_result = self.reconcile_all();
if reconcile_all_result.can_run_optimizations() {
// Run optimizer only when we didn't find any other work to do
self.optimize_all().await;
}
@@ -1214,7 +1251,7 @@ impl Service {
}
/// Heartbeat all storage nodes once in a while.
#[instrument(skip_all)]
async fn spawn_heartbeat_driver(&self) {
async fn spawn_heartbeat_driver(self: &Arc<Self>) {
self.startup_complete.clone().wait().await;
let mut interval = tokio::time::interval(self.config.heartbeat_interval);
@@ -1341,18 +1378,51 @@ impl Service {
}
}
if let Ok(deltas) = res_sk {
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
for (id, state) in deltas.0 {
let Some(sk) = safekeepers.get_mut(&id) else {
tracing::info!(
"Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"
);
continue;
};
sk.set_availability(state);
let mut to_activate = Vec::new();
{
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
for (id, state) in deltas.0 {
let Some(sk) = safekeepers.get_mut(&id) else {
tracing::info!(
"Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"
);
continue;
};
if sk.scheduling_policy() == SkSchedulingPolicy::Activating
&& let SafekeeperState::Available { .. } = state
{
to_activate.push(id);
}
sk.set_availability(state);
}
locked.safekeepers = Arc::new(safekeepers);
}
for sk_id in to_activate {
// TODO this can race with set_scheduling_policy (can create disjoint DB <-> in-memory state)
tracing::info!("Activating safekeeper {sk_id}");
match self.persistence.activate_safekeeper(sk_id.0 as i64).await {
Ok(Some(())) => {}
Ok(None) => {
tracing::info!(
"safekeeper {sk_id} has been removed from db or has different scheduling policy than active or activating"
);
}
Err(e) => {
tracing::warn!("couldn't apply activation of {sk_id} to db: {e}");
continue;
}
}
if let Err(e) = self
.set_safekeeper_scheduling_policy_in_mem(sk_id, SkSchedulingPolicy::Active)
.await
{
tracing::info!("couldn't activate safekeeper {sk_id} in memory: {e}");
continue;
}
tracing::info!("Activation of safekeeper {sk_id} done");
}
locked.safekeepers = Arc::new(safekeepers);
}
}
}
@@ -1408,6 +1478,7 @@ impl Service {
match result.result {
Ok(()) => {
tenant.consecutive_errors_count = 0;
tenant.apply_observed_deltas(deltas);
tenant.waiter.advance(result.sequence);
}
@@ -1426,6 +1497,8 @@ impl Service {
}
}
tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
tenant.set_last_error(result.sequence, e);
@@ -2343,6 +2416,7 @@ impl Service {
NodeSchedulingPolicy::PauseForRestart
| NodeSchedulingPolicy::Draining
| NodeSchedulingPolicy::Filling
| NodeSchedulingPolicy::Deleting
);
let mut new_nodes = (**nodes).clone();
@@ -6984,7 +7058,7 @@ impl Service {
/// If a node has any work on it, it will be rescheduled: this is "clean" in the sense
/// that we don't leave any bad state behind in the storage controller, but unclean
/// in the sense that we are not carefully draining the node.
pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> {
pub(crate) async fn node_delete_old(&self, node_id: NodeId) -> Result<(), ApiError> {
let _node_lock =
trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await;
@@ -7018,7 +7092,7 @@ impl Service {
}
for (_tenant_id, mut schedule_context, shards) in
TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal)
{
for shard in shards {
if shard.deref_node(node_id) {
@@ -7087,6 +7161,171 @@ impl Service {
Ok(())
}
pub(crate) async fn delete_node(
self: &Arc<Self>,
node_id: NodeId,
policy_on_start: NodeSchedulingPolicy,
cancel: CancellationToken,
) -> Result<(), OperationError> {
let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build();
let mut waiters: Vec<ReconcilerWaiter> = Vec::new();
let mut tid_iter = create_shared_shard_iterator(self.clone());
while !tid_iter.finished() {
if cancel.is_cancelled() {
match self
.node_configure(node_id, None, Some(policy_on_start))
.await
{
Ok(()) => return Err(OperationError::Cancelled),
Err(err) => {
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}",
node_id, String::from(policy_on_start), err
)
.into(),
));
}
}
}
operation_utils::validate_node_state(
&node_id,
self.inner.read().unwrap().nodes.clone(),
NodeSchedulingPolicy::Deleting,
)?;
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
let tid = match tid_iter.next() {
Some(tid) => tid,
None => {
break;
}
};
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let tenant_shard = match tenants.get_mut(&tid) {
Some(tenant_shard) => tenant_shard,
None => {
// Tenant shard was deleted by another operation. Skip it.
continue;
}
};
match tenant_shard.get_scheduling_policy() {
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {
// A migration during delete is classed as 'essential' because it is required to
// uphold our availability goals for the tenant: this shard is elegible for migration.
}
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
// If we have been asked to avoid rescheduling this shard, then do not migrate it during a deletion
tracing::warn!(
tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
"Skip migration during deletion because shard scheduling policy {:?} disallows it",
tenant_shard.get_scheduling_policy(),
);
continue;
}
}
if tenant_shard.deref_node(node_id) {
// TODO(ephemeralsad): we should process all shards in a tenant at once, so
// we can avoid settling the tenant unevenly.
let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal);
if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
tracing::error!(
"Refusing to delete node, shard {} can't be rescheduled: {e}",
tenant_shard.tenant_shard_id
);
return Err(OperationError::ImpossibleConstraint(e.to_string().into()));
} else {
tracing::info!(
"Rescheduled shard {} away from node during deletion",
tenant_shard.tenant_shard_id
)
}
let waiter = self.maybe_configured_reconcile_shard(
tenant_shard,
nodes,
reconciler_config,
);
if let Some(some) = waiter {
waiters.push(some);
}
}
}
waiters = self
.await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
.await;
failpoint_support::sleep_millis_async!("sleepy-delete-loop", &cancel);
}
while !waiters.is_empty() {
if cancel.is_cancelled() {
match self
.node_configure(node_id, None, Some(policy_on_start))
.await
{
Ok(()) => return Err(OperationError::Cancelled),
Err(err) => {
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}",
node_id, String::from(policy_on_start), err
)
.into(),
));
}
}
}
tracing::info!("Awaiting {} pending delete reconciliations", waiters.len());
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
self.persistence
.set_tombstone(node_id)
.await
.map_err(|e| OperationError::FinalizeError(e.to_string().into()))?;
{
let mut locked = self.inner.write().unwrap();
let (nodes, _, scheduler) = locked.parts_mut();
scheduler.node_remove(node_id);
let mut nodes_mut = (**nodes).clone();
if let Some(mut removed_node) = nodes_mut.remove(&node_id) {
// Ensure that any reconciler holding an Arc<> to this node will
// drop out when trying to RPC to it (setting Offline state sets the
// cancellation token on the Node object).
removed_node.set_availability(NodeAvailability::Offline);
}
*nodes = Arc::new(nodes_mut);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pageserver_nodes
.set(nodes.len() as i64);
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_https_pageserver_nodes
.set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
}
Ok(())
}
pub(crate) async fn node_list(&self) -> Result<Vec<Node>, ApiError> {
let nodes = {
self.inner
@@ -7475,7 +7714,7 @@ impl Service {
let mut tenants_affected: usize = 0;
for (_tenant_id, mut schedule_context, shards) in
TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal)
{
for tenant_shard in shards {
let tenant_shard_id = tenant_shard.tenant_shard_id;
@@ -7646,6 +7885,142 @@ impl Service {
self.node_configure(node_id, availability, scheduling).await
}
pub(crate) async fn start_node_delete(
self: &Arc<Self>,
node_id: NodeId,
) -> Result<(), ApiError> {
let (ongoing_op, node_policy, schedulable_nodes_count) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
let schedulable_nodes_count = nodes
.iter()
.filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
.count();
(
locked
.ongoing_operation
.as_ref()
.map(|ongoing| ongoing.operation),
node.get_scheduling(),
schedulable_nodes_count,
)
};
if let Some(ongoing) = ongoing_op {
return Err(ApiError::PreconditionFailed(
format!("Background operation already ongoing for node: {ongoing}").into(),
));
}
if schedulable_nodes_count == 0 {
return Err(ApiError::PreconditionFailed(
"No other schedulable nodes to move shards".into(),
));
}
match node_policy {
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Deleting))
.await?;
let cancel = self.cancel.child_token();
let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
let policy_on_start = node_policy;
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
operation: Operation::Delete(Delete { node_id }),
cancel: cancel.clone(),
});
let span = tracing::info_span!(parent: None, "delete_node", %node_id);
tokio::task::spawn(
{
let service = self.clone();
let cancel = cancel.clone();
async move {
let _gate_guard = gate_guard;
scopeguard::defer! {
let prev = service.inner.write().unwrap().ongoing_operation.take();
if let Some(Operation::Delete(removed_delete)) = prev.map(|h| h.operation) {
assert_eq!(removed_delete.node_id, node_id, "We always take the same operation");
} else {
panic!("We always remove the same operation")
}
}
tracing::info!("Delete background operation starting");
let res = service
.delete_node(node_id, policy_on_start, cancel)
.await;
match res {
Ok(()) => {
tracing::info!(
"Delete background operation completed successfully"
);
}
Err(OperationError::Cancelled) => {
tracing::info!("Delete background operation was cancelled");
}
Err(err) => {
tracing::error!(
"Delete background operation encountered: {err}"
)
}
}
}
}
.instrument(span),
);
}
NodeSchedulingPolicy::Deleting => {
return Err(ApiError::Conflict(format!(
"Node {node_id} has delete in progress"
)));
}
policy => {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} cannot be deleted due to {policy:?} policy").into(),
));
}
}
Ok(())
}
pub(crate) async fn cancel_node_delete(
self: &Arc<Self>,
node_id: NodeId,
) -> Result<(), ApiError> {
{
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
}
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
if let Operation::Delete(delete) = op_handler.operation {
if delete.node_id == node_id {
tracing::info!("Cancelling background delete operation for node {node_id}");
op_handler.cancel.cancel();
return Ok(());
}
}
}
Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no delete in progress").into(),
))
}
pub(crate) async fn start_node_drain(
self: &Arc<Self>,
node_id: NodeId,
@@ -8026,7 +8401,7 @@ impl Service {
/// Returns how many reconciliation tasks were started, or `1` if no reconciles were
/// spawned but some _would_ have been spawned if `reconciler_concurrency` units where
/// available. A return value of 0 indicates that everything is fully reconciled already.
fn reconcile_all(&self) -> usize {
fn reconcile_all(&self) -> ReconcileAllResult {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
@@ -8034,13 +8409,16 @@ impl Service {
// This function is an efficient place to update lazy statistics, since we are walking
// all tenants.
let mut pending_reconciles = 0;
let mut keep_failing_reconciles = 0;
let mut az_violations = 0;
// If we find any tenants to drop from memory, stash them to offload after
// we're done traversing the map of tenants.
let mut drop_detached_tenants = Vec::new();
let mut reconciles_spawned = 0;
let mut spawned_reconciles = 0;
let mut has_delayed_reconciles = false;
for shard in tenants.values_mut() {
// Accumulate scheduling statistics
if let (Some(attached), Some(preferred)) =
@@ -8060,18 +8438,32 @@ impl Service {
// If there is something delayed, then return a nonzero count so that
// callers like reconcile_all_now do not incorrectly get the impression
// that the system is in a quiescent state.
reconciles_spawned = std::cmp::max(1, reconciles_spawned);
has_delayed_reconciles = true;
pending_reconciles += 1;
continue;
}
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another one
let consecutive_errors_count = shard.consecutive_errors_count;
if self
.maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
.is_some()
{
reconciles_spawned += 1;
spawned_reconciles += 1;
// Count shards that are keep-failing. We still want to reconcile them
// to avoid a situation where a shard is stuck.
// But we don't want to consider them when deciding to run optimizations.
if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
tracing::warn!(
tenant_id=%shard.tenant_shard_id.tenant_id,
shard_id=%shard.tenant_shard_id.shard_slug(),
"Shard reconciliation is keep-failing: {} errors",
consecutive_errors_count
);
keep_failing_reconciles += 1;
}
} else if shard.delayed_reconcile {
// Shard wanted to reconcile but for some reason couldn't.
pending_reconciles += 1;
@@ -8110,7 +8502,16 @@ impl Service {
.storage_controller_pending_reconciles
.set(pending_reconciles as i64);
reconciles_spawned
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_keep_failing_reconciles
.set(keep_failing_reconciles as i64);
ReconcileAllResult::new(
spawned_reconciles,
keep_failing_reconciles,
has_delayed_reconciles,
)
}
/// `optimize` in this context means identifying shards which have valid scheduled locations, but
@@ -8196,7 +8597,7 @@ impl Service {
// to ignore the utilisation component of the score.
for (_tenant_id, schedule_context, shards) in
TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
TenantShardExclusiveIterator::new(tenants, ScheduleMode::Speculative)
{
for shard in shards {
if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
@@ -8783,13 +9184,13 @@ impl Service {
/// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
/// put the system into a quiescent state where future background reconciliations won't do anything.
pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
let reconciles_spawned = self.reconcile_all();
let reconciles_spawned = if reconciles_spawned == 0 {
let reconcile_all_result = self.reconcile_all();
let mut spawned_reconciles = reconcile_all_result.spawned_reconciles;
if reconcile_all_result.can_run_optimizations() {
// Only optimize when we are otherwise idle
self.optimize_all().await
} else {
reconciles_spawned
};
let optimization_reconciles = self.optimize_all().await;
spawned_reconciles += optimization_reconciles;
}
let waiters = {
let mut waiters = Vec::new();
@@ -8826,11 +9227,11 @@ impl Service {
tracing::info!(
"{} reconciles in reconcile_all, {} waiters",
reconciles_spawned,
spawned_reconciles,
waiter_count
);
Ok(std::cmp::max(waiter_count, reconciles_spawned))
Ok(std::cmp::max(waiter_count, spawned_reconciles))
}
async fn stop_reconciliations(&self, reason: StopReconciliationsReason) {
@@ -8923,25 +9324,7 @@ impl Service {
let mut waiters = Vec::new();
let mut tid_iter = TenantShardIterator::new({
let service = self.clone();
move |last_inspected_shard: Option<TenantShardId>| {
let locked = &service.inner.read().unwrap();
let tenants = &locked.tenants;
let entry = match last_inspected_shard {
Some(skip_past) => {
// Skip to the last seen tenant shard id
let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
// Skip past the last seen
cursor.nth(1)
}
None => tenants.first_key_value(),
};
entry.map(|(tid, _)| tid).copied()
}
});
let mut tid_iter = create_shared_shard_iterator(self.clone());
while !tid_iter.finished() {
if cancel.is_cancelled() {
@@ -8961,7 +9344,11 @@ impl Service {
}
}
drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?;
operation_utils::validate_node_state(
&node_id,
self.inner.read().unwrap().nodes.clone(),
NodeSchedulingPolicy::Draining,
)?;
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
let tid = match tid_iter.next() {
@@ -9041,7 +9428,7 @@ impl Service {
}
waiters = self
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
.await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
.await;
failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
@@ -9335,7 +9722,7 @@ impl Service {
}
waiters = self
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
.await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
.await;
}

View File

@@ -236,40 +236,30 @@ impl Service {
F: std::future::Future<Output = mgmt_api::Result<T>> + Send + 'static,
T: Sync + Send + 'static,
{
let target_sk_count = safekeepers.len();
if target_sk_count == 0 {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"timeline configured without any safekeepers"
)));
}
if target_sk_count < self.config.timeline_safekeeper_count {
tracing::warn!(
"running a quorum operation with {} safekeepers, which is less than configured {} safekeepers per timeline",
target_sk_count,
self.config.timeline_safekeeper_count
);
}
let results = self
.tenant_timeline_safekeeper_op(safekeepers, op, timeout)
.await?;
// Now check if quorum was reached in results.
let target_sk_count = safekeepers.len();
let quorum_size = match target_sk_count {
0 => {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"timeline configured without any safekeepers",
)));
}
1 | 2 => {
#[cfg(feature = "testing")]
{
// In test settings, it is allowed to have one or two safekeepers
target_sk_count
}
#[cfg(not(feature = "testing"))]
{
// The region is misconfigured: we need at least three safekeepers to be configured
// in order to schedule work to them
tracing::warn!(
"couldn't find at least 3 safekeepers for timeline, found: {:?}",
target_sk_count
);
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"couldn't find at least 3 safekeepers to put timeline to"
)));
}
}
_ => target_sk_count / 2 + 1,
};
let quorum_size = target_sk_count / 2 + 1;
let success_count = results.iter().filter(|res| res.is_ok()).count();
if success_count < quorum_size {
// Failure
@@ -815,7 +805,7 @@ impl Service {
Safekeeper::from_persistence(
crate::persistence::SafekeeperPersistence::from_upsert(
record,
SkSchedulingPolicy::Pause,
SkSchedulingPolicy::Activating,
),
CancellationToken::new(),
use_https,
@@ -856,27 +846,36 @@ impl Service {
.await?;
let node_id = NodeId(id as u64);
// After the change has been persisted successfully, update the in-memory state
{
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
let sk = safekeepers
.get_mut(&node_id)
.ok_or(DatabaseError::Logical("Not found".to_string()))?;
sk.set_scheduling_policy(scheduling_policy);
self.set_safekeeper_scheduling_policy_in_mem(node_id, scheduling_policy)
.await
}
match scheduling_policy {
SkSchedulingPolicy::Active => {
locked
.safekeeper_reconcilers
.start_reconciler(node_id, self);
}
SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => {
locked.safekeeper_reconcilers.stop_reconciler(node_id);
}
pub(crate) async fn set_safekeeper_scheduling_policy_in_mem(
self: &Arc<Service>,
node_id: NodeId,
scheduling_policy: SkSchedulingPolicy,
) -> Result<(), DatabaseError> {
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
let sk = safekeepers
.get_mut(&node_id)
.ok_or(DatabaseError::Logical("Not found".to_string()))?;
sk.set_scheduling_policy(scheduling_policy);
match scheduling_policy {
SkSchedulingPolicy::Active => {
locked
.safekeeper_reconcilers
.start_reconciler(node_id, self);
}
SkSchedulingPolicy::Decomissioned
| SkSchedulingPolicy::Pause
| SkSchedulingPolicy::Activating => {
locked.safekeeper_reconcilers.stop_reconciler(node_id);
}
locked.safekeepers = Arc::new(safekeepers);
}
locked.safekeepers = Arc::new(safekeepers);
Ok(())
}

View File

@@ -1,4 +1,5 @@
use std::collections::BTreeMap;
use std::sync::Arc;
use utils::id::TenantId;
use utils::shard::TenantShardId;
@@ -6,16 +7,21 @@ use utils::shard::TenantShardId;
use crate::scheduler::{ScheduleContext, ScheduleMode};
use crate::tenant_shard::TenantShard;
use super::Service;
/// Exclusive iterator over all tenant shards.
/// It is used to iterate over consistent tenants state at specific point in time.
///
/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole
/// tenant while considering the individual shards within it. This iterator is a helper
/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext
/// for the tenant.
pub(super) struct TenantShardContextIterator<'a> {
pub(super) struct TenantShardExclusiveIterator<'a> {
schedule_mode: ScheduleMode,
inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
}
impl<'a> TenantShardContextIterator<'a> {
impl<'a> TenantShardExclusiveIterator<'a> {
pub(super) fn new(
tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
schedule_mode: ScheduleMode,
@@ -27,7 +33,7 @@ impl<'a> TenantShardContextIterator<'a> {
}
}
impl<'a> Iterator for TenantShardContextIterator<'a> {
impl<'a> Iterator for TenantShardExclusiveIterator<'a> {
type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);
fn next(&mut self) -> Option<Self::Item> {
@@ -52,13 +58,93 @@ impl<'a> Iterator for TenantShardContextIterator<'a> {
}
}
/// Shared iterator over all tenant shards.
/// It is used to iterate over all tenants without blocking another code, working with tenants
///
/// A simple iterator which can be used in tandem with [`crate::service::Service`]
/// to iterate over all known tenant shard ids without holding the lock on the
/// service state at all times.
pub(crate) struct TenantShardSharedIterator<F> {
tenants_accessor: F,
inspected_all_shards: bool,
last_inspected_shard: Option<TenantShardId>,
}
impl<F> TenantShardSharedIterator<F>
where
F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
{
pub(crate) fn new(tenants_accessor: F) -> Self {
Self {
tenants_accessor,
inspected_all_shards: false,
last_inspected_shard: None,
}
}
pub(crate) fn finished(&self) -> bool {
self.inspected_all_shards
}
}
impl<F> Iterator for TenantShardSharedIterator<F>
where
F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
{
// TODO(ephemeralsad): consider adding schedule context to the iterator
type Item = TenantShardId;
/// Returns the next tenant shard id if one exists
fn next(&mut self) -> Option<Self::Item> {
if self.inspected_all_shards {
return None;
}
match (self.tenants_accessor)(self.last_inspected_shard) {
Some(tid) => {
self.last_inspected_shard = Some(tid);
Some(tid)
}
None => {
self.inspected_all_shards = true;
None
}
}
}
}
pub(crate) fn create_shared_shard_iterator(
service: Arc<Service>,
) -> TenantShardSharedIterator<impl Fn(Option<TenantShardId>) -> Option<TenantShardId>> {
let tenants_accessor = move |last_inspected_shard: Option<TenantShardId>| {
let locked = &service.inner.read().unwrap();
let tenants = &locked.tenants;
let entry = match last_inspected_shard {
Some(skip_past) => {
// Skip to the last seen tenant shard id
let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
// Skip past the last seen
cursor.nth(1)
}
None => tenants.first_key_value(),
};
entry.map(|(tid, _)| tid).copied()
};
TenantShardSharedIterator::new(tenants_accessor)
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::str::FromStr;
use std::sync::Arc;
use pageserver_api::controller_api::PlacementPolicy;
use utils::shard::{ShardCount, ShardNumber};
use utils::id::TenantId;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
use super::*;
use crate::scheduler::test_utils::make_test_nodes;
@@ -66,7 +152,7 @@ mod tests {
use crate::tenant_shard::tests::make_test_tenant_with_id;
#[test]
fn test_context_iterator() {
fn test_exclusive_shard_iterator() {
// Hand-crafted tenant IDs to ensure they appear in the expected order when put into
// a btreemap & iterated
let mut t_1_shards = make_test_tenant_with_id(
@@ -106,7 +192,7 @@ mod tests {
shard.schedule(&mut scheduler, &mut context).unwrap();
}
let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
let mut iter = TenantShardExclusiveIterator::new(&mut tenants, ScheduleMode::Speculative);
let (tenant_id, context, shards) = iter.next().unwrap();
assert_eq!(tenant_id, t1_id);
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
@@ -132,4 +218,46 @@ mod tests {
shard.intent.clear(&mut scheduler);
}
}
#[test]
fn test_shared_shard_iterator() {
let tenant_id = TenantId::generate();
let shard_count = ShardCount(8);
let mut tenant_shards = Vec::default();
for i in 0..shard_count.0 {
tenant_shards.push((
TenantShardId {
tenant_id,
shard_number: ShardNumber(i),
shard_count,
},
(),
))
}
let tenant_shards = Arc::new(tenant_shards);
let tid_iter = TenantShardSharedIterator::new({
let tenants = tenant_shards.clone();
move |last_inspected_shard: Option<TenantShardId>| {
let entry = match last_inspected_shard {
Some(skip_past) => {
let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
cursor.nth(1)
}
None => tenants.first(),
};
entry.map(|(tid, _)| tid).copied()
}
});
let mut iterated_over = Vec::default();
for tid in tid_iter {
iterated_over.push((tid, ()));
}
assert_eq!(iterated_over, *tenant_shards);
}
}

View File

@@ -131,6 +131,15 @@ pub(crate) struct TenantShard {
#[serde(serialize_with = "read_last_error")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
/// Number of consecutive reconciliation errors that have occurred for this shard.
///
/// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
/// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
/// allowing optimizations to run even with some failing shards.
///
/// The counter is reset to 0 after a successful reconciliation.
pub(crate) consecutive_errors_count: usize,
/// If we have a pending compute notification that for some reason we weren't able to send,
/// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
/// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope
@@ -594,6 +603,7 @@ impl TenantShard {
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
consecutive_errors_count: 0,
pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
preferred_node: None,
@@ -1859,6 +1869,7 @@ impl TenantShard {
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
consecutive_errors_count: 0,
pending_compute_notification: false,
delayed_reconcile: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),