From 6f8b1eb5a6d66f111ea6143b56ab185f6c3244d6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 22 Nov 2024 13:21:51 -0500
Subject: [PATCH] test(pageserver): add detach ancestor smoke test (#9842)

## Problem

Follow-up to https://github.com/neondatabase/neon/pull/9682. With this
test we hope to either detect issues or assure ourselves that the change
is ready for production.

## Summary of changes

* Add a compaction-detach-ancestor smoke test.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/fixtures/pageserver/http.py       |  2 +-
 test_runner/fixtures/workload.py              |  5 +-
 .../regress/test_timeline_detach_ancestor.py  | 54 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 98330ba350..4df624def3 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -343,7 +343,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json["tenant_shards"], list)
         return res_json
 
-    def tenant_get_location(self, tenant_id: TenantShardId):
+    def tenant_get_location(self, tenant_id: TenantId | TenantShardId):
         res = self.get(
             f"http://localhost:{self.port}/v1/location_config/{tenant_id}",
         )
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 72dc102538..4c6b2b6b3e 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -94,9 +94,10 @@ class Workload:
     def __del__(self):
         self.stop()
 
-    def init(self, pageserver_id: int | None = None):
+    def init(self, pageserver_id: int | None = None, allow_recreate=False):
         endpoint = self.endpoint(pageserver_id)
-
+        if allow_recreate:
+            endpoint.safe_psql(f"DROP TABLE IF EXISTS {self.table};")
         endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
         endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
         last_flush_lsn_upload(
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 1547ebc35d..cd4e0a5f3b 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -23,7 +23,8 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.utils import assert_pageserver_backups_equal, wait_until
+from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until
+from fixtures.workload import Workload
 from requests import ReadTimeout
 
 
@@ -1550,6 +1551,57 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
     env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
 
 
+@skip_in_debug_build("only run with release build")
+def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBuilder):
+    SMOKE_CONF = {
+        # Run both gc and compaction with a 5s period.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon (1024**2)
+        "pitr_interval": "0s",
+        "gc_horizon": f"{1024 ** 2}",
+        "lsn_lease_length": "0s",
+        # Small checkpoint distance to create many layers
+        "checkpoint_distance": 1024**2,
+        # Compact small layers
+        "compaction_target_size": 1024**2,
+        "image_creation_threshold": 2,
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 10000  # rows written initially and churned per round, per timeline
+    churn_rounds = 50  # churn iterations on both timelines before detaching
+
+    ps_http = env.pageserver.http_client()
+
+    workload_parent = Workload(env, tenant_id, timeline_id)
+    workload_parent.init(env.pageserver.id)
+    log.info("Writing initial data ...")
+    workload_parent.write_rows(row_count, env.pageserver.id)
+    branch_id = env.create_branch("child")  # created after the parent's initial write
+    workload_child = Workload(env, tenant_id, branch_id, branch_name="child")
+    workload_child.init(env.pageserver.id, allow_recreate=True)  # DROP TABLE IF EXISTS first
+    log.info("Writing initial data on child...")
+    workload_child.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        if i % 10 == 0:  # only log every 10th round
+            log.info(f"Running churn round {i}/{churn_rounds} ...")
+
+        workload_parent.churn_rows(row_count, env.pageserver.id)
+        workload_child.churn_rows(row_count, env.pageserver.id)
+
+    ps_http.detach_ancestor(tenant_id, branch_id)  # detach "child" once all churn is done
+
+    log.info("Validating at workload end ...")
+    workload_parent.validate(env.pageserver.id)  # both workloads are validated after detach
+    workload_child.validate(env.pageserver.id)
+
+
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.