test(pageserver): add detach ancestor smoke test (#9842)

## Problem

Follow-up to https://github.com/neondatabase/neon/pull/9682. Hopefully
we can detect some issues or assure ourselves that this is ready for
production.

## Summary of changes

* Add a compaction-detach-ancestor smoke test.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2024-11-22 13:21:51 -05:00
committed by GitHub
parent e939d36dd4
commit 6f8b1eb5a6
3 changed files with 57 additions and 4 deletions

View File

@@ -343,7 +343,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
assert isinstance(res_json["tenant_shards"], list)
return res_json
def tenant_get_location(self, tenant_id: TenantShardId):
def tenant_get_location(self, tenant_id: TenantId | TenantShardId):
res = self.get(
f"http://localhost:{self.port}/v1/location_config/{tenant_id}",
)

View File

@@ -94,9 +94,10 @@ class Workload:
def __del__(self):
self.stop()
def init(self, pageserver_id: int | None = None):
def init(self, pageserver_id: int | None = None, allow_recreate=False):
endpoint = self.endpoint(pageserver_id)
if allow_recreate:
endpoint.safe_psql(f"DROP TABLE IF EXISTS {self.table};")
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
last_flush_lsn_upload(

View File

@@ -23,7 +23,8 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.utils import assert_pageserver_backups_equal, wait_until
from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until
from fixtures.workload import Workload
from requests import ReadTimeout
@@ -1550,6 +1551,57 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBuilder):
    """
    Smoke test for detaching a branch from its ancestor under gc/compaction churn.

    The initial tenant is started with short gc and compaction periods and
    1 MiB size thresholds. Rows are written on the initial timeline, a branch
    named "child" is created and populated as well, and then both timelines
    are churned for a number of rounds. Finally the child is detached from its
    ancestor through the pageserver HTTP client and both workloads are
    validated.
    """
    mib = 1024**2
    tenant_conf = {
        # Short periods so that both gc and gc-compaction run during the test.
        "gc_period": "5s",
        "compaction_period": "5s",
        # Zero PiTR interval together with a small GC horizon.
        "pitr_interval": "0s",
        "gc_horizon": f"{mib}",
        "lsn_lease_length": "0s",
        # A small checkpoint distance produces many layers.
        "checkpoint_distance": mib,
        # Make compaction work on small layers.
        "compaction_target_size": mib,
        "image_creation_threshold": 2,
    }
    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)

    churn_rounds = 50
    row_count = 10000
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    ps_http = env.pageserver.http_client()
    pageserver_id = env.pageserver.id

    # Populate the parent (initial) timeline.
    workload_parent = Workload(env, tenant_id, timeline_id)
    workload_parent.init(pageserver_id)
    log.info("Writing initial data ...")
    workload_parent.write_rows(row_count, pageserver_id)

    # Branch off and populate the child; the table is dropped and recreated
    # on the branch via allow_recreate.
    branch_id = env.create_branch("child")
    workload_child = Workload(env, tenant_id, branch_id, branch_name="child")
    workload_child.init(pageserver_id, allow_recreate=True)
    log.info("Writing initial data on child...")
    workload_child.write_rows(row_count, pageserver_id)

    both_workloads = (workload_parent, workload_child)

    for round_no in range(1, churn_rounds + 1):
        # Only report progress on every tenth round.
        if round_no % 10 == 0:
            log.info(f"Running churn round {round_no}/{churn_rounds} ...")
        for workload in both_workloads:
            workload.churn_rows(row_count, pageserver_id)

    ps_http.detach_ancestor(tenant_id, branch_id)

    log.info("Validating at workload end ...")
    for workload in both_workloads:
        workload.validate(pageserver_id)
# TODO:
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.