control plane: improve handling of stripe size

2026-07-05 21:20:37 +00:00 · 2023-12-12 15:07:01 +00:00
parent bc90272e47
commit 3ccf5abc9d
4 changed files with 65 additions and 11 deletions
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -3,7 +3,10 @@ use anyhow::anyhow;
 use camino::Utf8PathBuf;
 use hyper::{Method, StatusCode};
 use pageserver_api::{
-    models::{TenantCreateRequest, TenantShardSplitRequest, TimelineCreateRequest, TimelineInfo},
+    models::{
+        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TimelineCreateRequest,
+        TimelineInfo,
+    },
    shard::TenantShardId,
 };
 use postgres_connection::parse_host_port;
@@ -79,6 +82,7 @@ pub struct TenantLocateResponseShard {
 #[derive(Serialize, Deserialize)]
 pub struct TenantLocateResponse {
    pub shards: Vec<TenantLocateResponseShard>,
+    pub shard_params: ShardParameters,
 }

 impl AttachmentService {
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -9,7 +9,7 @@ use clap::Parser;
 use hyper::{Body, Request, Response};
 use hyper::{Method, StatusCode};
 use pageserver_api::models::{
-    LocationConfig, LocationConfigMode, TenantConfig, TenantCreateRequest,
+    LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
    TenantLocationConfigRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    TimelineCreateRequest, TimelineInfo,
 };
@@ -514,8 +514,9 @@ async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>,
    let mut locked = state.write().await;

    tracing::info!(
-        "Creating tenant {}, have {} pageservers",
+        "Creating tenant {}, shard_count={:?}, have {} pageservers",
        create_req.new_tenant_id,
+        create_req.shard_parameters.count,
        locked.pageservers.len()
    );

@@ -548,6 +549,7 @@ async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>,
            shard_number,
            shard_count: create_req.shard_parameters.count,
        };
+        tracing::info!("Creating shard {tenant_shard_id}...");

        use std::collections::btree_map::Entry;
        match locked.tenants.entry(tenant_shard_id) {
@@ -685,6 +687,7 @@ async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiE
    let pageservers = locked.pageservers.clone();

    let mut result = Vec::new();
+    let mut shard_params: Option<ShardParameters> = None;

    for (tenant_shard_id, shard) in locked
        .tenants
@@ -708,9 +711,40 @@ async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiE
            listen_pg_addr: node.listen_pg_addr.clone(),
            listen_pg_port: node.listen_pg_port,
        });
+
+        match &shard_params {
+            None => {
+                shard_params = Some(ShardParameters {
+                    stripe_size: Some(shard.shard.stripe_size),
+                    count: shard.shard.count,
+                });
+            }
+            Some(params) => {
+                if params.stripe_size != Some(shard.shard.stripe_size) {
+                    // This should never happen.  We enforce at runtime because it's simpler than
+                    // adding an extra per-tenant data structure to store the things that should be the same
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Inconsistent shard stripe size parameters!"
+                    )));
+                }
+            }
+        }
    }

-    json_response(StatusCode::OK, TenantLocateResponse { shards: result })
+    if result.is_empty() {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("No shards for this tenant ID found").into(),
+        ));
+    }
+    let shard_params = shard_params.expect("result is non-empty, therefore this is set");
+
+    json_response(
+        StatusCode::OK,
+        TenantLocateResponse {
+            shards: result,
+            shard_params,
+        },
+    )
 }

 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -774,7 +808,7 @@ async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Bo
        let client = Client::new();
        let response = client
            .request(
-                Method::POST,
+                Method::PUT,
                format!("{}/tenant/{}/shard_split", node.base_url(), tenant_shard_id),
            )
            .json(&TenantShardSplitRequest {
@@ -788,6 +822,9 @@ async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Bo
        // response.error_for_status().map_err(|e| {
        //     ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
        // })?;
+        response.error_for_status_ref().map_err(|e| {
+            ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
+        })?;
        let response: TenantShardSplitResponse = response.json().await.map_err(|e| {
            ApiError::InternalServerError(anyhow::anyhow!(
                "Malformed response from pageserver: {}",
@@ -799,6 +836,9 @@ async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Bo
    }

    // Replace all the shards we just split with their children
+    let mut response = TenantShardSplitResponse {
+        new_shards: Vec::new(),
+    };
    for (replaced, children) in replacements.into_iter() {
        let (pageserver, generation, shard_ident, config) = {
            let old_state = locked
@@ -847,10 +887,12 @@ async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Bo
                    config: config.clone(),
                },
            );
+
+            response.new_shards.push(child);
        }
    }

-    json_response(StatusCode::OK, ())
+    json_response(StatusCode::OK, response)
 }

 /// Status endpoint is just used for checking that our HTTP listener is up
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -427,7 +427,7 @@ async fn handle_tenant(
            let shard_count: u8 = create_match
                .get_one::<u8>("shard-count")
                .cloned()
-                .unwrap_or(1);
+                .unwrap_or(0);

            let shard_stripe_size: Option<u32> =
                create_match.get_one::<u32>("shard-stripe-size").cloned();
@@ -909,9 +909,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
            )?;

            let attachment_service = AttachmentService::from_env(env);
-            let pageservers = attachment_service
-                .tenant_locate(endpoint.tenant_id)
-                .await?
+            let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
+            let pageservers = locate_result
                .shards
                .into_iter()
                .map(|shard| {
@@ -922,6 +921,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                    )
                })
                .collect::<Vec<_>>();
+            let stripe_size = locate_result.shard_params.stripe_size.map(|s| s.0 as usize);

            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -934,7 +934,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

            println!("Starting existing endpoint {endpoint_id}...");
            endpoint
-                .start(&auth_token, safekeepers, pageservers, remote_ext_config)
+                .start(
+                    &auth_token,
+                    safekeepers,
+                    pageservers,
+                    remote_ext_config,
+                    stripe_size,
+                )
                .await?;
        }
        "reconfigure" => {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -466,6 +466,7 @@ impl Endpoint {
        safekeepers: Vec<NodeId>,
        pageservers: Vec<(Host, u16)>,
        remote_ext_config: Option<&String>,
+        shard_stripe_size: Option<usize>,
    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
@@ -529,6 +530,7 @@ impl Endpoint {
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
            remote_extensions,
+            shard_stripe_size,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;