From a12072b789a9f250b96886f87e85229e8962adff Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 15 Feb 2024 14:07:57 +0000
Subject: [PATCH] tests: add test_sharding_compaction

---
 test_runner/regress/test_sharding.py | 80 ++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 9b3a28d6c1..d3550169a3 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,3 +1,4 @@
+import json
 import os
 import time
 from collections import defaultdict
@@ -1039,3 +1040,82 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
     max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos)
     diff = max_lsn - min_lsn
     assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure"
+
+
+# Stripe sizes in number of pages.
+TINY_STRIPES = 16
+LARGE_STRIPES = 32768
+
+
+@pytest.mark.parametrize("stripe_size", [TINY_STRIPES, LARGE_STRIPES])
+def test_sharding_compaction(neon_env_builder: NeonEnvBuilder, stripe_size: int):
+    """
+    Use small stripes, small layers, and small compaction thresholds to exercise how compaction
+    and image layer generation interacts with sharding.
+    """
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{128 * 1024}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # create image layers eagerly: we want to exercise image layer creation in this test.
+        "image_creation_threshold": "1",
+        "image_layer_creation_check_threshold": 0,
+    }
+
+    neon_env_builder.num_pageservers = 4
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=4,
+        initial_tenant_shard_stripe_size=stripe_size,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(64)
+    for _i in range(0, 10):
+        # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
+        # these should result in image layers each time we write some data into a shard, and also shards
+        # receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
+        # rather than asserting)
+        workload.churn_rows(64)
+
+    # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes
+    # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job.
+    shard_has_image_layers = []
+    for shard in env.storage_controller.locate(tenant_id):
+        pageserver = env.get_pageserver(shard["node_id"])
+        shard_id = shard["shard_id"]
+        layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
+        image_layer_sizes = {}
+        for layer in layer_map.historic_layers:
+            if layer.kind == "Image":
+                image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
+
+                # Pageserver should assert rather than emit an empty layer file, but double check here
+                assert layer.layer_file_size is not None
+                assert layer.layer_file_size > 0
+
+        shard_has_image_layers.append(len(image_layer_sizes) > 1)
+        log.info(f"Shard {shard_id} layer sizes: {json.dumps(image_layer_sizes, indent=2)}")
+
+    # TODO: once keyspace partitioning is updated, assert that layer sizes are as expected
+    # (see https://github.com/neondatabase/neon/issues/6774)
+
+    if stripe_size == TINY_STRIPES:
+        # Expect writes were scattered across all pageservers: they should all have compacted some image layers
+        assert all(shard_has_image_layers)
+    else:
+        # With large stripes, it is expected that most of our writes went to one pageserver, so we just require
+        # that at least one of them has some image layers.
+        assert any(shard_has_image_layers)
+
+    # Assert that everything is still readable
+    workload.validate()