pageserver: implement auto-splitting (#7681)
## Problem

Currently, tenants are only split into multiple shards if a human calls the API to do it.

Issue: #7388

## Summary of changes

- Add a pageserver API for returning the top tenants by size.
- Add a step to the controller's background loop: if there is no reconciliation or optimization work to do, it looks for shards to split.
- Add a test that runs pgbench on many tenants concurrently and checks that splitting happens as expected as the tenants grow, without interrupting client I/O.

This PR is deliberately basic: the task list in https://github.com/neondatabase/neon/issues/7388 tracks further work. It is meant to be safe (off by default) and sufficient to let our staging environment run lots of sharded tenants without a human having to set them up.
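As a rough illustration of the policy added here, a shard becomes a split candidate once its logical size exceeds the configured threshold and its tenant is still below the maximum shard count, and the biggest candidate is split first. The names below (`ShardInfo`, `MAX_SHARD_COUNT`, `pick_split_candidate`) are illustrative stand-ins, not the controller's real types:

```rust
// Rough, self-contained sketch of the selection policy; hypothetical types, not the
// storage controller's real ones.

struct ShardInfo {
    tenant: &'static str,
    shard_count: u8,
    max_logical_size: u64, // bytes
}

// Mirrors the idea of SPLIT_TO_MAX in the diff below: tenants already at this shard
// count are never split further.
const MAX_SHARD_COUNT: u8 = 8;

/// Return the largest shard that exceeds the threshold and still has room to split.
/// A `None` threshold means auto-splitting is disabled.
fn pick_split_candidate(
    shards: &[ShardInfo],
    split_threshold: Option<u64>,
) -> Option<&ShardInfo> {
    let threshold = split_threshold?;
    shards
        .iter()
        .filter(|s| s.shard_count < MAX_SHARD_COUNT && s.max_logical_size > threshold)
        .max_by_key(|s| s.max_logical_size)
}

fn main() {
    let shards = [
        ShardInfo { tenant: "small", shard_count: 1, max_logical_size: 2u64 << 30 },
        ShardInfo { tenant: "growing", shard_count: 1, max_logical_size: 100u64 << 30 },
        ShardInfo { tenant: "already-split", shard_count: 8, max_logical_size: 500u64 << 30 },
    ];
    // With a 64 GiB threshold, only "growing" qualifies: "small" is under the threshold
    // and "already-split" is already at the maximum shard count.
    let candidate = pick_split_candidate(&shards, Some(64u64 << 30));
    assert_eq!(candidate.map(|c| c.tenant), Some("growing"));
}
```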
@@ -66,6 +66,10 @@ struct Cli {
     #[arg(long)]
     max_unavailable_interval: Option<humantime::Duration>,
 
+    /// Size threshold for automatically splitting shards (disabled by default)
+    #[arg(long)]
+    split_threshold: Option<u64>,
+
     /// Maximum number of reconcilers that may run in parallel
     #[arg(long)]
     reconciler_concurrency: Option<usize>,
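The new flag is a plain clap option on the storage controller binary. A minimal standalone sketch of how such a field behaves, assuming clap 4 with the `derive` feature and its default kebab-case naming (`--split-threshold <BYTES>`); this is not the real `Cli` struct:

```rust
// Hypothetical, minimal CLI; assumes clap = { version = "4", features = ["derive"] }.
use clap::Parser;

#[derive(Parser, Debug)]
struct Cli {
    /// Size threshold for automatically splitting shards (disabled by default)
    #[arg(long)]
    split_threshold: Option<u64>,
}

fn main() {
    // Omitting the flag leaves the option as None, which disables auto-splitting.
    let off = Cli::parse_from(["ctl"]);
    assert_eq!(off.split_threshold, None);

    // Passing a byte count enables it.
    let on = Cli::parse_from(["ctl", "--split-threshold", "64000000000"]);
    assert_eq!(on.split_threshold, Some(64_000_000_000));
}
```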
@@ -255,6 +259,7 @@ async fn async_main() -> anyhow::Result<()> {
         reconciler_concurrency: args
             .reconciler_concurrency
             .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
+        split_threshold: args.split_threshold,
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
@@ -2,7 +2,7 @@ use pageserver_api::{
     models::{
         LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
         TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
+        TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
     },
     shard::TenantShardId,
 };
@@ -234,4 +234,16 @@ impl PageserverClient {
             self.inner.get_utilization().await
         )
     }
+
+    pub(crate) async fn top_tenant_shards(
+        &self,
+        request: TopTenantShardsRequest,
+    ) -> Result<TopTenantShardsResponse> {
+        measured_request!(
+            "top_tenants",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.top_tenant_shards(request).await
+        )
+    }
 }
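`measured_request!` is the storage controller's existing wrapper for instrumenting outbound pageserver calls; its definition is not part of this diff. As an assumed, simplified sketch of the pattern it follows (time the awaited call and record the outcome under endpoint/method/node labels), with a hypothetical `measured` helper standing in for the macro:

```rust
// Assumed behaviour only, not the actual macro. Assumes tokio with "macros" and "rt" features.
use std::time::Instant;

async fn measured<T, E>(
    endpoint: &str,
    method: &str,
    node_id: &str,
    fut: impl std::future::Future<Output = Result<T, E>>,
) -> Result<T, E> {
    let started = Instant::now();
    let result = fut.await;
    let outcome = if result.is_ok() { "ok" } else { "error" };
    // The real code would feed a metrics registry; here we just print the observation.
    println!(
        "pageserver request endpoint={endpoint} method={method} node={node_id} \
         outcome={outcome} latency={:?}",
        started.elapsed()
    );
    result
}

#[tokio::main]
async fn main() {
    let reply: Result<u64, String> =
        measured("top_tenants", "POST", "node-1", async { Ok(42) }).await;
    assert_eq!(reply, Ok(42));
}
```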
@@ -32,10 +32,10 @@ use pageserver_api::{
         TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
         UtilizationScore,
     },
-    models::{SecondaryProgress, TenantConfigRequest},
+    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
 use reqwest::StatusCode;
-use tracing::instrument;
+use tracing::{instrument, Instrument};
 
 use crate::pageserver_client::PageserverClient;
 use pageserver_api::{
@@ -222,6 +222,10 @@ pub struct Config {
 
     /// How many Reconcilers may be spawned concurrently
    pub reconciler_concurrency: usize,
+
+    /// How large must a shard grow in bytes before we split it?
+    /// None disables auto-splitting.
+    pub split_threshold: Option<u64>,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -699,7 +703,7 @@ impl Service {
     /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
     /// for those retries.
     #[instrument(skip_all)]
-    async fn background_reconcile(&self) {
+    async fn background_reconcile(self: &Arc<Self>) {
         self.startup_complete.clone().wait().await;
 
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
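The receiver changes from `&self` to `self: &Arc<Self>` because the auto-split path below spawns a detached task, and that task needs an owned `Arc` clone of the service so the spawned future can be `'static`. A toy, self-contained sketch of that pattern (hypothetical `Service` and `do_split`, not the real service type; assumes tokio):

```rust
// Toy illustration of the `self: &Arc<Self>` receiver pattern.
// Assumes tokio = { version = "1", features = ["full"] }.
use std::{sync::Arc, time::Duration};

struct Service;

impl Service {
    fn autosplit_tenants(self: &Arc<Self>) {
        // Clone the Arc so the detached task owns the service,
        // mirroring `let this = self.clone()` in the real code.
        let this = self.clone();
        tokio::spawn(async move {
            this.do_split().await;
        });
    }

    async fn do_split(&self) {
        println!("splitting in the background");
    }
}

#[tokio::main]
async fn main() {
    let svc = Arc::new(Service);
    svc.autosplit_tenants();
    // Give the detached task a moment to run; the real service leaves it detached too.
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```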
@@ -711,7 +715,11 @@ impl Service {
                     let reconciles_spawned = self.reconcile_all();
                     if reconciles_spawned == 0 {
                         // Run optimizer only when we didn't find any other work to do
-                        self.optimize_all().await;
+                        let optimizations = self.optimize_all().await;
+                        if optimizations == 0 {
+                            // Run new splits only when no optimizations are pending
+                            self.autosplit_tenants().await;
+                        }
                     }
                 }
                 _ = self.cancel.cancelled() => return
@@ -4766,6 +4774,104 @@ impl Service {
         validated_work
     }
 
+    /// Look for shards which are oversized and in need of splitting
+    async fn autosplit_tenants(self: &Arc<Self>) {
+        let Some(split_threshold) = self.config.split_threshold else {
+            // Auto-splitting is disabled
+            return;
+        };
+
+        let nodes = self.inner.read().unwrap().nodes.clone();
+
+        const SPLIT_TO_MAX: ShardCount = ShardCount::new(8);
+
+        let mut top_n = Vec::new();
+
+        // Call into each node to look for big tenants
+        let top_n_request = TopTenantShardsRequest {
+            // We currently split based on logical size, for simplicity: logical size is a signal of
+            // the user's intent to run a large database, whereas physical/resident size can be a
+            // symptom of compaction issues. Eventually we should switch to using resident size to
+            // bound the disk space impact of one shard.
+            order_by: models::TenantSorting::MaxLogicalSize,
+            limit: 10,
+            where_shards_lt: Some(SPLIT_TO_MAX),
+            where_gt: Some(split_threshold),
+        };
+        for node in nodes.values() {
+            let request_ref = &top_n_request;
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let request = request_ref.clone();
+                        client.top_tenant_shards(request.clone()).await
+                    },
+                    &self.config.jwt_token,
+                    3,
+                    3,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(node_top_n)) => {
+                    top_n.extend(node_top_n.shards.into_iter());
+                }
+                Some(Err(mgmt_api::Error::Cancelled)) => {
+                    continue;
+                }
+                Some(Err(e)) => {
+                    tracing::warn!("Failed to fetch top N tenants from {node}: {e}");
+                    continue;
+                }
+                None => {
+                    // Node is shutting down
+                    continue;
+                }
+            };
+        }
+
+        // Pick the biggest tenant to split first: sort descending by size
+        top_n.sort_by_key(|i| std::cmp::Reverse(i.resident_size));
+        let Some(split_candidate) = top_n.into_iter().next() else {
+            tracing::debug!("No split-eligible shards found");
+            return;
+        };
+
+        // We spawn a task to run this, so it's exactly like some external API client requesting
+        // it. We don't want to block the background reconcile loop on this.
+        tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}");
+
+        let this = self.clone();
+        tokio::spawn(
+            async move {
+                match this
+                    .tenant_shard_split(
+                        split_candidate.id.tenant_id,
+                        TenantShardSplitRequest {
+                            // Always split to the max number of shards: this avoids stepping through
+                            // intervening shard counts and encountering the overhead of a split+cleanup
+                            // each time as a tenant grows, and is not too expensive because our max
+                            // shard count is relatively low anyway.
+                            // This policy will be adjusted in future once we support higher shard counts.
+                            new_shard_count: SPLIT_TO_MAX.literal(),
+                            new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+                        },
+                    )
+                    .await
+                {
+                    Ok(_) => {
+                        tracing::info!("Successful auto-split");
+                    }
+                    Err(e) => {
+                        tracing::error!("Auto-split failed: {e}");
+                    }
+                }
+            }
+            .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)),
+        );
+    }
+
     /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
     /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
     /// put the system into a quiescent state where future background reconciliations won't do anything.
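A tiny standalone check of the candidate ordering above: sorting by `std::cmp::Reverse` of the size puts the largest shard first, so taking the first element of the sorted list yields the biggest split candidate.

```rust
// Demonstrates the descending sort used when choosing which shard to split first.
fn main() {
    let mut sizes = vec![10u64, 300, 42];
    sizes.sort_by_key(|s| std::cmp::Reverse(*s));
    assert_eq!(sizes.first(), Some(&300));
}
```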