Add test_explicit_timeline_creation_storcon and make it work (#11261)

Adds a basic test that makes the storcon issue explicit creation of a
timeline on safekeepers (main storcon PR in #11058). It was adapted from
`test_explicit_timeline_creation` from #11002.

Also, do a bunch of fixes needed to get the test to work (the API
definitions weren't correct), and log more stuff when we can't create a
new timeline due to no safekeepers being active.

Part of #9011

---------

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
This commit is contained in:
Arpad Müller
2025-03-17 17:28:21 +01:00
committed by GitHub
parent db30e1669c
commit 56149a046a
6 changed files with 58 additions and 12 deletions

View File

@@ -23,6 +23,7 @@ pub struct TimelineCreateRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mconf: Configuration,
/// In the PG_VERSION_NUM macro format, like 140017.
pub pg_version: u32,
pub system_id: Option<u64>,
// By default WAL_SEGMENT_SIZE

View File

@@ -81,13 +81,10 @@ impl Client {
}
}
pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<TimelineStatus> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}",
self.mgmt_api_endpoint, req.tenant_id, req.timeline_id
);
pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<reqwest::Response> {
let uri = format!("{}/v1/tenant/timeline", self.mgmt_api_endpoint);
let resp = self.post(&uri, req).await?;
resp.json().await.map_err(Error::ReceiveBody)
Ok(resp)
}
pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result<PullTimelineResponse> {

View File

@@ -1,6 +1,5 @@
use safekeeper_api::models::{
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
TimelineStatus,
};
use safekeeper_client::mgmt_api::{Client, Result};
use utils::id::{NodeId, TenantId, TimelineId};
@@ -60,7 +59,7 @@ impl SafekeeperClient {
pub(crate) async fn create_timeline(
&self,
req: &TimelineCreateRequest,
) -> Result<TimelineStatus> {
) -> Result<reqwest::Response> {
measured_request!(
"create_timeline",
crate::metrics::Method::Post,

View File

@@ -3804,7 +3804,7 @@ impl Service {
create_mode: models::TimelineCreateRequestMode,
) -> Result<SafekeepersInfo, ApiError> {
let timeline_id = timeline_info.timeline_id;
let pg_version = timeline_info.pg_version;
let pg_version = timeline_info.pg_version * 10000;
// Initially start_lsn is determined by last_record_lsn in pageserver
// response as it does initdb. However, later we persist it and in sk
// creation calls replace with the value from the timeline row if it
@@ -8723,6 +8723,8 @@ impl Service {
pub(crate) async fn safekeepers_for_new_timeline(
&self,
) -> Result<Vec<SafekeeperInfo>, ApiError> {
// Number of safekeepers in different AZs we are looking for
let wanted_count = 3;
let mut all_safekeepers = {
let locked = self.inner.read().unwrap();
locked
@@ -8768,15 +8770,17 @@ impl Service {
continue;
}
sks.push(sk_info.clone());
if sks.len() == 3 {
if sks.len() == wanted_count {
break;
}
}
if sks.len() == 3 {
if sks.len() == wanted_count {
Ok(sks)
} else {
Err(ApiError::InternalServerError(anyhow::anyhow!(
"couldn't find three safekeepers in different AZs for new timeline"
"couldn't find {wanted_count} safekeepers in different AZs for new timeline (found: {}, total active: {})",
sks.len(),
all_safekeepers.len(),
)))
}
}

View File

@@ -1321,6 +1321,28 @@ class NeonEnv:
for f in futs:
f.result()
# Last step: register safekeepers at the storage controller
if (
self.storage_controller_config is not None
and self.storage_controller_config.get("timelines_onto_safekeepers") is True
):
for sk_id, sk in enumerate(self.safekeepers):
body = {
"id": sk_id,
"created_at": "2023-10-25T09:11:25Z",
"updated_at": "2024-08-28T11:32:43Z",
"region_id": "aws-us-east-2",
"host": "127.0.0.1",
"port": sk.port.pg,
"http_port": sk.port.http,
"https_port": None,
"version": 5957,
"availability_zone_id": f"us-east-2b-{sk_id}",
}
self.storage_controller.on_safekeeper_deploy(sk_id, body)
self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active")
def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
"""
After this method returns, there should be no child processes running.

View File

@@ -2039,6 +2039,29 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
def test_explicit_timeline_creation_storcon(neon_env_builder: NeonEnvBuilder):
"""
Test that having neon.safekeepers starting with g#n: with non zero n enables
generations, which as a side effect disables automatic timeline creation.
Like test_explicit_timeline_creation, but asks the storcon to
create membership conf & timeline.
"""
neon_env_builder.num_safekeepers = 3
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
}
env = neon_env_builder.init_start()
config_lines = [
"neon.safekeeper_proto_version = 3",
]
ep = env.endpoints.create("main", config_lines=config_lines)
# endpoint should start.
ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
# when compute is active, but there are no writes to the timeline. In that case
# pageserver should maintain a single connection to safekeeper and don't attempt