diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index e824595841..7db80b536a 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -612,6 +612,8 @@ async fn handle_tenant_create(mut req: Request) -> Result, })?; } + locked.save().await.map_err(ApiError::InternalServerError)?; + json_response( StatusCode::OK, TenantCreateResponse { @@ -842,6 +844,17 @@ async fn handle_tenant_shard_split(mut req: Request) -> Result>() + .join(",") + ); + replacements.insert(*tenant_shard_id, response.new_shards); } @@ -863,6 +876,8 @@ async fn handle_tenant_shard_split(mut req: Request) -> Result) -> Result("shard-count").cloned().unwrap_or(0); let attachment_service = AttachmentService::from_env(env); - attachment_service + let result = attachment_service .tenant_split(tenant_id, shard_count) .await?; + println!( + "Split tenant {} into shards {}", + tenant_id, + result + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index b58fe00e2f..d33d827581 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -152,7 +152,7 @@ pub async fn migrate_tenant( let cplane = ComputeControlPlane::load(env.clone())?; for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_shard_id.tenant_id { + if endpoint.tenant_id == tenant_shard_id.tenant_id && endpoint.status() == "running" { println!( "🔁 Reconfiguring endpoint {} to use pageserver {}", endpoint_name, dest_ps.conf.id @@ -178,19 +178,24 @@ pub async fn migrate_tenant( continue; } - // Downgrade to a secondary location - let secondary_conf = build_location_config( - LocationConfigMode::Secondary, - None, - Some(LocationConfigSecondary { warm: true }), - ); + // // Downgrade to a secondary location + // let secondary_conf = build_location_config( + // LocationConfigMode::Secondary, + // None, + // Some(LocationConfigSecondary { warm: true }), + // ); - println!( - "💤 Switching to secondary mode on pageserver {}", - other_ps.conf.id - ); + // println!( + // "💤 Switching to secondary mode on pageserver {}", + // other_ps.conf.id + // ); + // other_ps + // .location_config(tenant_shard_id, secondary_conf, None) + // .await?; + let detached_conf = build_location_config(LocationConfigMode::Detached, None, None); + println!("💤 Detaching on pageserver {}", other_ps.conf.id); other_ps - .location_config(tenant_shard_id, secondary_conf, None) + .location_config(tenant_shard_id, detached_conf, None) .await?; } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2b04ff5a3d..7a274e8dd5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -405,13 +405,20 @@ impl PageServerHandler { // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant // that we look up here may not be the one that serves all the actual requests: we will double // check the mapping of key->shard later before calling into Timeline for getpage requests. - let tenant = mgr::get_active_tenant_with_timeout( + let tenant = match mgr::get_active_tenant_with_timeout( tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT, &task_mgr::shutdown_token(), ) - .await?; + .await + { + Ok(t) => t, + Err(e) => { + tracing::warn!("Error at start of handle_pagerequests: {}", e); + return Err(e.into()); + } + }; // Make request tracer if needed let mut tracer = if tenant.get_trace_read_requests() { @@ -426,9 +433,18 @@ impl PageServerHandler { }; // Check that the timeline exists - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| anyhow::anyhow!(e))?; + let timeline = match tenant.get_timeline(timeline_id, true) { + Ok(t) => t, + Err(e) => { + tracing::warn!("Error getting timeline: {}", e); + return Err(QueryError::Other(anyhow::anyhow!(e))); + } + }; + + tracing::info!( + "handle_pagerequests: got timeline {}", + timeline.tenant_shard_id + ); // Avoid starting new requests if the timeline has already started shutting down, // and block timeline shutdown until this request is complete, or drops out due @@ -815,6 +831,10 @@ impl PageServerHandler { let key = rel_block_to_key(req.rel, req.blkno); let page = if timeline.get_shard_identity().is_key_local(&key) { + tracing::debug!( + "handle_get_page_at_lsn: using shard {}", + timeline.tenant_shard_id + ); timeline .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) .await? @@ -851,6 +871,11 @@ impl PageServerHandler { Err(e) => return Err(e.into()), }; + tracing::debug!( + "handle_get_page_at_lsn: using shard {}", + timeline.tenant_shard_id + ); + // Take a GateGuard for the duration of this request. If we were using our main Timeline object, // the GateGuard was already held over the whole connection. let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e49e9c277e..d6cd3e3da7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1533,6 +1533,7 @@ impl Tenant { })?; if active_only && !timeline.is_active() { + tracing::warn!("Timeline {} is not active", timeline.timeline_id); Err(GetTimelineError::NotActive { tenant_id: self.tenant_shard_id.tenant_id, timeline_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e84fa1848..1f7b7d0092 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -903,10 +903,15 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { + tracing::info!("activate 1"); self.spawn_initial_logical_size_computation_task(ctx); + tracing::info!("activate 2"); self.launch_wal_receiver(ctx, broker_client); + tracing::info!("activate 3"); self.set_state(TimelineState::Active); + tracing::info!("activate 4"); self.launch_eviction_task(background_jobs_can_start); + tracing::info!("activate 5"); } /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then