diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index f1e3d1a309..c7cea4ec04 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -117,6 +117,9 @@ class LayerMapInfo:
     def image_layers(self) -> List[HistoricLayerInfo]:
         return [x for x in self.historic_layers if x.kind == "Image"]
 
+    def delta_l0_layers(self) -> List[HistoricLayerInfo]:
+        return [x for x in self.historic_layers if x.kind == "Delta" and x.l0]
+
     def historic_by_name(self) -> Set[str]:
         return set(x.layer_file_name for x in self.historic_layers)
 
diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py
index 326c4f5c6f..077b76104c 100644
--- a/test_runner/performance/test_compaction.py
+++ b/test_runner/performance/test_compaction.py
@@ -2,6 +2,7 @@ from contextlib import closing
 
 import pytest
 from fixtures.compare_fixtures import NeonCompare
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import wait_for_last_flush_lsn
 
 
@@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare):
         pageserver_http.timeline_compact(tenant_id, timeline_id)
 
     neon_compare.report_size()
+
+
+def test_compaction_l0_memory(neon_compare: NeonCompare):
+    """
+    Generate a large stack of L0s pending compaction into L1s, and
+    measure the pageserver's peak RSS while doing so.
+    """
+
+    env = neon_compare.env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # Initially disable compaction so that we will build up a stack of L0s
+            "compaction_period": "0s",
+            "gc_period": "0s",
+        }
+    )
+    neon_compare.tenant = tenant_id
+    neon_compare.timeline = timeline_id
+
+    endpoint = env.endpoints.create_start(
+        "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
+    )
+
+    # Read the tenant's effective config and assert on checkpoint_distance and compaction_threshold:
+    # we deliberately test with the defaults (to match what runs in the field), but this test's workload size makes assumptions about their values.
+    #
+    # If these assertions fail, it probably means we changed the defaults.
+    tenant_conf = pageserver_http.tenant_config(tenant_id)
+    assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024
+    assert tenant_conf.effective_config["compaction_threshold"] == 10
+
+    # Aim to write about 20 L0s, so that we will hit the limit on how many
+    # to compact at once
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            for i in range(200):
+                cur.execute(f"create table tbl{i} (i int, j int);")
+                cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);")
+                for j in range(100):
+                    cur.execute(f"update tbl{i} set j = {j};")
+
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    endpoint.stop()
+
+    # Check we have generated the L0 stack we expected
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    initial_l0s = len(layers.delta_l0_layers())
+    initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
+    log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})")
+
+    def rss_hwm():
+        v = pageserver_http.get_metric_value("libmetrics_maxrss_kb")
+        assert v is not None
+        assert v > 0
+        return v * 1024
+
+    before = rss_hwm()
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    after = rss_hwm()
+
+    log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})")
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
+    log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})")
+
+    assert after > before  # If we didn't use some memory the test is probably buggy
+    compaction_mapped_rss = after - before
+
+    # During L0 compaction, we require as much memory as the physical size of what we compacted, and then some,
+    # because the key->value mapping in L0 compaction is exhaustive, non-streaming, and does not de-duplicate
+    # repeated references to the same key.
+    #
+    # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
+    # this memory estimate can be revised far downwards to something that doesn't scale
+    # linearly with the layer sizes.
+    MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
+
+    # If we find that compaction is using more memory than this, it may indicate a regression
+    assert compaction_mapped_rss < MEMORY_ESTIMATE
+
+    # If we find that compaction is using <0.5x the expected memory then:
+    # - maybe we made a big efficiency improvement, in which case update the test
+    # - maybe something is functionally wrong with the test and it's not driving the system as expected
+    assert compaction_mapped_rss > MEMORY_ESTIMATE / 2
+
+    # We should have compacted some but not all of the L0s, based on the limit on how much
+    # L0 to compact in one go
+    assert len(layers.delta_l0_layers()) > 0
+    assert len(layers.delta_l0_layers()) < initial_l0s
+
+    # The pageserver should have logged when it hit the compaction size limit
+    env.pageserver.assert_log_contains(".*hit max delta layer size limit.*")
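
Note on the acceptance band asserted above (a sketch for reviewers, not part of the patch): the test treats the growth in peak RSS across one compaction pass as acceptable when it lands between 0.5x and 1.0x of a size-derived estimate, where the estimate is the number of compacted L0 bytes plus 25% headroom. The helper below restates that check in isolation; the function name and parameters (rss_before, rss_after, compacted_bytes) are illustrative only and are not part of the test fixtures.

    # Illustrative restatement of the RSS acceptance band used by test_compaction_l0_memory.
    # The 1.25 headroom factor and the 0.5x lower bound mirror the test's assertions; the
    # name and signature here are hypothetical, not fixture APIs.
    def compaction_rss_within_band(rss_before: int, rss_after: int, compacted_bytes: int) -> bool:
        grew = rss_after - rss_before
        estimate = compacted_bytes * 1.25  # scales linearly with layer size until issue 8184 is addressed
        # Too much growth suggests a memory regression; much less than expected suggests either
        # a real efficiency win (update the estimate) or a test that isn't driving compaction.
        return estimate / 2 < grew < estimate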