control_plane: support for shard splitting

Author: John Spray
Date:   2024-01-03 14:53:51 +00:00
Parent: e9f7510abf
Commit: 71ff404e6c
5 changed files with 238 additions and 4 deletions
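
For context, a rough sketch of the request/response types this change exchanges between the attachment service and the pageserver. Their real definitions live in pageserver_api::models and are not part of this diff; the shapes below are inferred from how the code in this commit uses them, so treat them as an assumption rather than the authoritative definitions:

use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};

// Assumed shape: new_shard_count is passed around as a u8 throughout this commit.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
    pub new_shard_count: u8,
}

// Assumed shape: new_shards is iterated as a list of shard IDs carrying shard_number
// and shard_count.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
    pub new_shards: Vec<TenantShardId>,
}

Assuming default serde field names, the new endpoint therefore accepts a PUT of {"new_shard_count": N} at /tenant/:tenant_id/shard_split and answers with {"new_shards": [...]}.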

View File

@@ -2,7 +2,7 @@ use crate::reconciler::ReconcileError;
use crate::service::Service;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
-use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
+use pageserver_api::models::{TenantCreateRequest, TenantShardSplitRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use utils::http::endpoint::request_span;
@@ -129,6 +129,20 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}

async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_shard_split(tenant_id, split_req)
            .await?,
    )
}

async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
@@ -172,6 +186,9 @@ pub fn make_router(service: Arc<Service>) -> RouterBuilder<hyper::Body, ApiError
.get("/tenant/:tenant_id/locate", |r| {
request_span(r, handle_tenant_locate)
})
.put("/tenant/:tenant_id/shard_split", |r| {
request_span(r, handle_tenant_shard_split)
})
.put("/tenant/:tenant_shard_id/migrate", |r| {
request_span(r, handle_tenant_shard_migrate)
})

View File

@@ -16,7 +16,10 @@ use pageserver_api::{
        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
        ValidateResponse, ValidateResponseTenant,
    },
-    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
+    models::{
+        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TimelineCreateRequest, TimelineInfo,
+    },
    shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId},
};
use reqwest::Client;
@@ -29,8 +32,12 @@ use utils::{
use crate::{
    compute_hook::ComputeHook,
    node::Node,
    reconciler::attached_location_conf,
    scheduler::Scheduler,
-    tenant_state::{ReconcileResult, ReconcilerWaiter, TenantState},
+    tenant_state::{
+        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcilerWaiter,
+        TenantState,
+    },
    PlacementPolicy,
};
@@ -524,6 +531,167 @@ impl Service {
        })
    }

    pub(crate) async fn tenant_shard_split(
        &self,
        tenant_id: TenantId,
        split_req: TenantShardSplitRequest,
    ) -> Result<TenantShardSplitResponse, ApiError> {
        let mut policy = None;

        // Under the state lock, find every parent shard of this tenant and the pageserver
        // it is currently attached to: these are the split targets.
        let (targets, compute_hook) = {
            let mut locked = self.inner.write().unwrap();
            let pageservers = locked.nodes.clone();
            let mut targets = Vec::new();
            for (tenant_shard_id, shard) in locked
                .tenants
                .range_mut(TenantShardId::tenant_range(tenant_id))
            {
                if policy.is_none() {
                    policy = Some(shard.policy.clone());
                }

                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
                    tracing::warn!(
                        "Tenant shard {} already has shard count {}",
                        tenant_shard_id,
                        split_req.new_shard_count
                    );
                    continue;
                }

                let node_id = shard
                    .intent
                    .attached
                    .ok_or(ApiError::BadRequest(anyhow::anyhow!(
                        "Cannot split a tenant that is not attached"
                    )))?;

                let node = pageservers
                    .get(&node_id)
                    .expect("Pageservers may not be deleted while referenced");

                // TODO: if any reconciliation is currently in progress for this shard, wait for it.

                targets.push((*tenant_shard_id, node.clone()));
            }
            (targets, locked.compute_hook.clone())
        };
        // Ask each parent shard's pageserver to perform the split, collecting the child
        // shard IDs it reports back.
        let mut replacements = HashMap::new();
        for (tenant_shard_id, node) in targets {
            let client = Client::new();
            let response = client
                .request(
                    Method::PUT,
                    format!("{}/tenant/{}/shard_split", node.base_url(), tenant_shard_id),
                )
                .json(&TenantShardSplitRequest {
                    new_shard_count: split_req.new_shard_count,
                })
                .send()
                .await
                .map_err(|e| {
                    ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
                })?;
            response.error_for_status_ref().map_err(|e| {
                ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
            })?;
            let response: TenantShardSplitResponse = response.json().await.map_err(|e| {
                ApiError::InternalServerError(anyhow::anyhow!(
                    "Malformed response from pageserver: {}",
                    e
                ))
            })?;

            tracing::info!(
                "Split {} into {}",
                tenant_shard_id,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );

            replacements.insert(tenant_shard_id, response.new_shards);
        }

        // TODO: concurrency: we're dropping the state lock while issuing split API calls.
        // We should add some marker to the TenantState that causes any other change
        // to refuse until the split is complete. This will be related to a persistent
        // splitting marker that will ensure resume after crash.
        // Replace all the shards we just split with their children, carrying over the
        // parent's attachment, generation and config to each child.
        let mut response = TenantShardSplitResponse {
            new_shards: Vec::new(),
        };
        let mut child_locations = Vec::new();
        {
            let mut locked = self.inner.write().unwrap();
            for (replaced, children) in replacements.into_iter() {
                // Removing the parent also captures the state the children will inherit.
                let (pageserver, generation, shard_ident, config) = {
                    let old_state = locked
                        .tenants
                        .remove(&replaced)
                        .expect("It was present, we just split it");
                    (
                        old_state.intent.attached.unwrap(),
                        old_state.generation,
                        old_state.shard,
                        old_state.config.clone(),
                    )
                };

                for child in children {
                    let mut child_shard = shard_ident;
                    child_shard.number = child.shard_number;
                    child_shard.count = child.shard_count;

                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> =
                        HashMap::new();
                    child_observed.insert(
                        pageserver,
                        ObservedStateLocation {
                            conf: Some(attached_location_conf(generation, &child_shard, &config)),
                        },
                    );

                    let mut child_state = TenantState::new(
                        child,
                        child_shard,
                        policy
                            .clone()
                            .expect("We set this if any replacements are pushed"),
                    );
                    child_state.intent = IntentState::single(Some(pageserver));
                    child_state.observed = ObservedState {
                        locations: child_observed,
                    };
                    child_state.generation = generation;
                    child_state.config = config.clone();

                    child_locations.push((child, pageserver));

                    locked.tenants.insert(child, child_state);
                    response.new_shards.push(child);
                }
            }
        }

        // Notify computes about the new child shards and their locations; failures are
        // logged but do not fail the split.
        for (child_id, child_ps) in child_locations {
            if let Err(e) = compute_hook.notify(child_id, child_ps).await {
                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
                    child_id, child_ps);
            }
        }

        Ok(response)
    }
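
    // To make the replacement step in tenant_shard_split above concrete, a hedged
    // illustration (the set of children is decided by the pageserver's split response, not
    // computed here): splitting an unsharded parent with new_shard_count = 4 is expected to
    // yield children that share the parent's tenant_id and look roughly like
    // (shard_number 0, shard_count 4) through (shard_number 3, shard_count 4), each initially
    // attached to the parent's pageserver and inheriting its generation and configuration.
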
    pub(crate) async fn tenant_shard_migrate(
        &self,
        tenant_shard_id: TenantShardId,

View File

@@ -133,6 +133,13 @@ impl IntentState {
        result
    }

    /// Build an intent with a single attached location and no secondaries. The shard split
    /// path uses this when registering newly created child shards.
    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
        Self {
            attached: node_id,
            secondary: vec![],
        }
    }

    /// When a node goes offline, we update intents to avoid using it
    /// as their attached pageserver.
    ///

View File

@@ -3,7 +3,10 @@ use anyhow::anyhow;
use camino::Utf8PathBuf;
use hyper::{Method, StatusCode};
use pageserver_api::{
-    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
+    models::{
+        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TimelineCreateRequest, TimelineInfo,
+    },
    shard::TenantShardId,
};
use postgres_connection::parse_host_port;
@@ -344,6 +347,20 @@ impl AttachmentService {
        .await
    }

    #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
    pub async fn tenant_split(
        &self,
        tenant_id: TenantId,
        new_shard_count: u8,
    ) -> anyhow::Result<TenantShardSplitResponse> {
        self.dispatch(
            Method::PUT,
            format!("tenant/{tenant_id}/shard_split"),
            Some(TenantShardSplitRequest { new_shard_count }),
        )
        .await
    }

    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
View File

@@ -572,6 +572,26 @@ async fn handle_tenant(
println!("{tenant_table}");
println!("{shard_table}");
}
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let attachment_service = AttachmentService::from_env(env);
let result = attachment_service
.tenant_split(tenant_id, shard_count)
.await?;
println!(
"Split tenant {} into shards {}",
tenant_id,
result
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
}
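
A hedged usage sketch for the new subcommand, assuming the usual neon_local invocation and the environment's default tenant (only --shard-count appears in this diff; the tenant id argument comes from the shared tenant_id_arg): running neon_local tenant shard-split --shard-count 4 asks the attachment service to split the tenant into four shards and prints the resulting child shard IDs.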
@@ -1514,6 +1534,11 @@ fn cli() -> Command {
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("shard-split")
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
)
        )
        .subcommand(
            Command::new("pageserver")