Add test_explicit_timeline_creation_storcon and make it work (#11261)

Adds a basic test that makes the storcon issue explicit creation of a timeline on safekeepers (main storcon PR in #11058). It was adapted from `test_explicit_timeline_creation` from #11002. Also, do a bunch of fixes needed to get the test to work (the API definitions weren't correct), and log more stuff when we can't create a new timeline due to no safekeepers being active. Part of #9011 --------- Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
2025-12-23 06:09:59 +00:00 · 2025-03-17 17:28:21 +01:00
parent db30e1669c
commit 56149a046a
6 changed files with 58 additions and 12 deletions
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -23,6 +23,7 @@ pub struct TimelineCreateRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mconf: Configuration,
+    /// In the PG_VERSION_NUM macro format, like 140017.
    pub pg_version: u32,
    pub system_id: Option<u64>,
    // By default WAL_SEGMENT_SIZE
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -81,13 +81,10 @@ impl Client {
        }
    }

-    pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<TimelineStatus> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}",
-            self.mgmt_api_endpoint, req.tenant_id, req.timeline_id
-        );
+    pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<reqwest::Response> {
+        let uri = format!("{}/v1/tenant/timeline", self.mgmt_api_endpoint);
        let resp = self.post(&uri, req).await?;
-        resp.json().await.map_err(Error::ReceiveBody)
+        Ok(resp)
    }

    pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result<PullTimelineResponse> {
--- a/storage_controller/src/safekeeper_client.rs
+++ b/storage_controller/src/safekeeper_client.rs
@@ -1,6 +1,5 @@
 use safekeeper_api::models::{
    self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
-    TimelineStatus,
 };
 use safekeeper_client::mgmt_api::{Client, Result};
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -60,7 +59,7 @@ impl SafekeeperClient {
    pub(crate) async fn create_timeline(
        &self,
        req: &TimelineCreateRequest,
-    ) -> Result<TimelineStatus> {
+    ) -> Result<reqwest::Response> {
        measured_request!(
            "create_timeline",
            crate::metrics::Method::Post,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3804,7 +3804,7 @@ impl Service {
        create_mode: models::TimelineCreateRequestMode,
    ) -> Result<SafekeepersInfo, ApiError> {
        let timeline_id = timeline_info.timeline_id;
-        let pg_version = timeline_info.pg_version;
+        let pg_version = timeline_info.pg_version * 10000;
        // Initially start_lsn is determined by last_record_lsn in pageserver
        // response as it does initdb. However, later we persist it and in sk
        // creation calls replace with the value from the timeline row if it
@@ -8723,6 +8723,8 @@ impl Service {
    pub(crate) async fn safekeepers_for_new_timeline(
        &self,
    ) -> Result<Vec<SafekeeperInfo>, ApiError> {
+        // Number of safekeepers in different AZs we are looking for
+        let wanted_count = 3;
        let mut all_safekeepers = {
            let locked = self.inner.read().unwrap();
            locked
@@ -8768,15 +8770,17 @@ impl Service {
                continue;
            }
            sks.push(sk_info.clone());
-            if sks.len() == 3 {
+            if sks.len() == wanted_count {
                break;
            }
        }
-        if sks.len() == 3 {
+        if sks.len() == wanted_count {
            Ok(sks)
        } else {
            Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "couldn't find three safekeepers in different AZs for new timeline"
+                "couldn't find {wanted_count} safekeepers in different AZs for new timeline (found: {}, total active: {})",
+                sks.len(),
+                all_safekeepers.len(),
            )))
        }
    }
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1321,6 +1321,28 @@ class NeonEnv:
        for f in futs:
            f.result()

+        # Last step: register safekeepers at the storage controller
+        if (
+            self.storage_controller_config is not None
+            and self.storage_controller_config.get("timelines_onto_safekeepers") is True
+        ):
+            for sk_id, sk in enumerate(self.safekeepers):
+                body = {
+                    "id": sk_id,
+                    "created_at": "2023-10-25T09:11:25Z",
+                    "updated_at": "2024-08-28T11:32:43Z",
+                    "region_id": "aws-us-east-2",
+                    "host": "127.0.0.1",
+                    "port": sk.port.pg,
+                    "http_port": sk.port.http,
+                    "https_port": None,
+                    "version": 5957,
+                    "availability_zone_id": f"us-east-2b-{sk_id}",
+                }
+
+                self.storage_controller.on_safekeeper_deploy(sk_id, body)
+                self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active")
+
    def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
        """
        After this method returns, there should be no child processes running.
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2039,6 +2039,29 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
    ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")


+def test_explicit_timeline_creation_storcon(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that having neon.safekeepers starting with g#n: with non zero n enables
+    generations, which as a side effect disables automatic timeline creation.
+    Like test_explicit_timeline_creation, but asks the storcon to
+    create membership conf & timeline.
+    """
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+    }
+    env = neon_env_builder.init_start()
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create("main", config_lines=config_lines)
+
+    # endpoint should start.
+    ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
+    ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+
+
 # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
 # when compute is active, but there are no writes to the timeline. In that case
 # pageserver should maintain a single connection to safekeeper and don't attempt