mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 17:02:56 +00:00
fixes https://github.com/neondatabase/neon/issues/7790 (duplicating most of the issue description here for posterity) # Background From the time before always-authoritative `index_part.json`, we had to handle duplicate layers. See the RFC for an illustration of how duplicate layers could happen:a8e6d259cb/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md (L41-L50)As of #5198 , we should not be exposed to that problem anymore. # Problem 1 We still have 1. [code in Pageserver](82960b2175/pageserver/src/tenant/timeline.rs (L4502-L4521)) that handles duplicate layers 2. [tests in the test suite](d9dcbffac3/test_runner/regress/test_duplicate_layers.py (L15)) that demonstrate the problem using a failpoint However, the test in the test suite doesn't use the failpoint to induce a crash that could legitimately happen in production. What it does instead is to return early with an `Ok()`, so that the code in Pageserver that handles duplicate layers (item 1) actually gets exercised. That "return early" would be a bug in the routine if it happened in production. So, the tests in the test suite are tests for their own sake, but don't serve to actually regress-test any production behavior. # Problem 2 Further, if production code _did_ (it nowadays doesn't!) create a duplicate layer, the code in Pageserver that handles the condition (item 1 above) is too little and too late: * the code handles it by discarding the newer `struct Layer`; that's good. * however, on disk, we have already overwritten the old with the new layer file * the fact that we do it atomically doesn't matter because ... 
* if the new layer file is not bit-identical, then we have a cache coherency problem * PS PageCache block cache: caches old bit pattern * blob_io offsets stored in variables, based on pre-overwrite bit pattern / offsets * => reading based on these offsets from the new file might yield different data than before # Solution - Remove the test suite code pertaining to Problem 1 - Move & rename test suite code that actually tests RFC-27 crash-consistent layer map. - Remove the Pageserver code that handles duplicate layers too late (Problem 1) - Use `RENAME_NOREPLACE` to prevent over-renaming the file during `.finish()`, bail with an error if it happens (Problem 2) - This bailing prevents the caller from even trying to insert into the layer map, as they don't even get a `struct Layer` at hand. - Add `abort`s in the place where we have the layer map lock and check for duplicates (Problem 2) - Note again, we can't reach there because we bail from `.finish()` much earlier in the code. - Share the logic to clean up after failed `.finish()` between image layers and delta layers (drive-by cleanup) - This exposed that test `image_layer_rewrite` was overwriting layer files in place. Fix the test. # Future Work This PR adds a new failure scenario that was previously "papered over" by the overwriting of layers: 1. Start a compaction that will produce 3 layers: A, B, C 2. Layer A is `finish()`ed successfully. 3. Layer B fails mid-way at some `put_value()`. 4. Compaction bails out, sleeps 20s. 5. Some disk space gets freed in the meantime. 6. Compaction wakes from sleep, another iteration starts, it attempts to write Layer A again. But the `.finish()` **fails because A already exists on disk**. The failure in step 6 is new with this PR, and it **causes the compaction to get stuck**. Before, it would silently overwrite the file and "successfully" complete the second iteration. The mitigation for this is to `/reset` the tenant.
260 lines
11 KiB
Python
260 lines
11 KiB
Python
import enum
|
|
import json
|
|
import os
|
|
from typing import Optional
|
|
|
|
import pytest
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
|
|
from fixtures.pageserver.http import PageserverApiException
|
|
from fixtures.workload import Workload
|
|
|
|
# Tenant configuration for tests that drive compaction by hand: background
# gc/compaction loops are disabled, and checkpoint/compaction targets are kept
# small so the workload produces many layers quickly.
# NOTE: the (misspelled) name is kept as-is; it is referenced below.
AGGRESIVE_COMPACTION_TENANT_CONF = {
    # Disable gc and compaction. The test runs compaction manually.
    "gc_period": "0s",
    "compaction_period": "0s",
    # Small checkpoint distance to create many layers
    "checkpoint_distance": 1024**2,
    # Compact small layers
    "compaction_target_size": 1024**2,
    "image_creation_threshold": 2,
}
|
|
|
|
|
|
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
    """
    Smoke test that compaction kicks in.

    The workload repeatedly churns a small number of rows and manually
    instructs the pageserver to run compaction between iterations. At the end,
    validate that the average number of layers visited to gather reconstruct
    data for a given key is within the empirically observed bounds.
    """

    # Effectively disable the page cache to rely only on image layers
    # to shorten reads.
    neon_env_builder.pageserver_config_override = """
page_cache_size=10
"""

    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    row_count = 10000
    churn_rounds = 100

    ps_http = env.pageserver.http_client()

    workload = Workload(env, tenant_id, timeline_id)
    workload.init(env.pageserver.id)

    log.info("Writing initial data ...")
    workload.write_rows(row_count, env.pageserver.id)

    # Churn the same rows over and over, compacting after every round.
    for i in range(1, churn_rounds + 1):
        if i % 10 == 0:
            log.info(f"Running churn round {i}/{churn_rounds} ...")

        workload.churn_rows(row_count, env.pageserver.id)
        ps_http.timeline_compact(tenant_id, timeline_id)

    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)

    log.info("Checking layer access metrics ...")

    layer_access_metric_names = [
        "pageserver_layers_visited_per_read_global_sum",
        "pageserver_layers_visited_per_read_global_count",
        "pageserver_layers_visited_per_read_global_bucket",
        "pageserver_layers_visited_per_vectored_read_global_sum",
        "pageserver_layers_visited_per_vectored_read_global_count",
        "pageserver_layers_visited_per_vectored_read_global_bucket",
    ]

    metrics = env.pageserver.http_client().get_metrics()
    for name in layer_access_metric_names:
        layer_access_metrics = metrics.query_all(name)
        log.info(f"Got metrics: {layer_access_metrics}")

    non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
    non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
    # Avoid dividing by zero when no non-vectored reads happened at all.
    non_vectored_average = (
        non_vectored_sum.value / non_vectored_count.value
        if non_vectored_count.value != 0
        else 0
    )

    vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
    vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
    if vectored_count.value == 0:
        # special case: running local tests with default legacy configuration
        assert vectored_sum.value == 0
        vectored_average = 0
    else:
        assert vectored_sum.value > 0
        vectored_average = vectored_sum.value / vectored_count.value

    log.info(f"{non_vectored_average=} {vectored_average=}")

    # The upper bound for average number of layer visits below (8)
    # was chosen empirically for this workload.
    assert non_vectored_average < 8
    assert vectored_average < 8
|
|
|
|
|
|
# Stripe sizes in number of pages.
# Used by test_sharding_compaction below: TINY_STRIPES scatters writes across
# all shards, LARGE_STRIPES tends to concentrate them on one shard.
TINY_STRIPES = 16
LARGE_STRIPES = 32768
|
|
|
|
|
|
@pytest.mark.parametrize(
    "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)]
)
def test_sharding_compaction(
    neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int]
):
    """
    Use small stripes, small layers, and small compaction thresholds to exercise how compaction
    and image layer generation interacts with sharding.

    We are looking for bugs that might emerge from the way sharding uses sparse layer files that
    only contain some of the keys in the key range covered by the layer, such as errors estimating
    the size of layers that might result in too-small layer files.
    """

    compaction_target_size = 128 * 1024

    TENANT_CONF = {
        # small checkpointing and compaction targets to ensure we generate many upload operations
        "checkpoint_distance": f"{128 * 1024}",
        "compaction_threshold": "1",
        "compaction_target_size": f"{compaction_target_size}",
        # no PITR horizon, we specify the horizon when we request on-demand GC
        "pitr_interval": "0s",
        # disable background compaction and GC. We invoke it manually when we want it to happen.
        "gc_period": "0s",
        "compaction_period": "0s",
        # create image layers eagerly: we want to exercise image layer creation in this test.
        "image_creation_threshold": "1",
        "image_layer_creation_check_threshold": 0,
    }

    neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF,
        initial_tenant_shard_count=shard_count,
        initial_tenant_shard_stripe_size=stripe_size,
    )

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
    workload.write_rows(64)
    for _ in range(10):
        # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
        # these should result in image layers each time we write some data into a shard, and also shards
        # receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
        # rather than asserting)
        workload.churn_rows(64)

    # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes
    # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job.
    shard_has_image_layers = []
    for shard in env.storage_controller.locate(tenant_id):
        pageserver = env.get_pageserver(shard["node_id"])
        shard_id = shard["shard_id"]
        layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
        image_layer_sizes = {}
        for layer in layer_map.historic_layers:
            if layer.kind == "Image":
                image_layer_sizes[layer.layer_file_name] = layer.layer_file_size

                # Pageserver should assert rather than emit an empty layer file, but double check here
                assert layer.layer_file_size > 0

        shard_has_image_layers.append(len(image_layer_sizes) > 1)
        log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}")

        if stripe_size == TINY_STRIPES:
            # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if
            # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical
            # data in a keyrange.
            #
            # We only do this check with tiny stripes, because large stripes may not give all shards enough
            # data to have statistically significant image layers
            #
            # Fail with a clear assertion message rather than a ZeroDivisionError if a
            # shard unexpectedly produced no image layers at all.
            assert image_layer_sizes, f"Shard {shard_id} has no image layers"
            avg_size = sum(image_layer_sizes.values()) / len(image_layer_sizes)
            log.info(f"Shard {shard_id} average image layer size: {avg_size}")
            assert avg_size > compaction_target_size / 2

    if stripe_size == TINY_STRIPES:
        # Expect writes were scattered across all pageservers: they should all have compacted some image layers
        assert all(shard_has_image_layers)
    else:
        # With large stripes, it is expected that most of our writes went to one pageserver, so we just require
        # that at least one of them has some image layers.
        assert any(shard_has_image_layers)

    # Assert that everything is still readable
    workload.validate()
|
|
|
|
|
|
class CompactionAlgorithm(str, enum.Enum):
    """Compaction algorithm selector; the value is the `kind` string sent in tenant config."""

    LEGACY = "legacy"
    TIERED = "tiered"
|
|
|
|
|
|
@pytest.mark.parametrize(
    "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED]
)
def test_uploads_and_deletions(
    neon_env_builder: NeonEnvBuilder,
    compaction_algorithm: CompactionAlgorithm,
):
    """
    Run the shared upload/deletion workload against each compaction algorithm.

    :param compaction_algorithm: the compaction algorithm to use.
    """

    tenant_conf = {
        # small checkpointing and compaction targets to ensure we generate many upload operations
        "checkpoint_distance": f"{128 * 1024}",
        "compaction_threshold": "1",
        "compaction_target_size": f"{128 * 1024}",
        # no PITR horizon, we specify the horizon when we request on-demand GC
        "pitr_interval": "0s",
        # disable background compaction and GC. We invoke it manually when we want it to happen.
        "gc_period": "0s",
        "compaction_period": "0s",
        # create image layers eagerly, so that GC can remove some layers
        "image_creation_threshold": "1",
        "image_layer_creation_check_threshold": "0",
        "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}),
    }
    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)

    # TODO remove these allowed errors
    # https://github.com/neondatabase/neon/issues/7707
    # https://github.com/neondatabase/neon/issues/7759
    allowed_errors = [
        ".*/checkpoint.*rename temporary file as correct path for.*",  # EEXIST
        ".*delta layer created with.*duplicate values.*",
        ".*assertion failed: self.lsn_range.start <= lsn.*",
        ".*HTTP request handler task panicked: task.*panicked.*",
    ]
    if compaction_algorithm == CompactionAlgorithm.TIERED:
        env.pageserver.allowed_errors.extend(allowed_errors)

    try:
        generate_uploads_and_deletions(env, pageserver=env.pageserver)
    except PageserverApiException as e:
        log.info(f"Obtained PageserverApiException: {e}")

    # The errors occur flakily; no single one is guaranteed to occur,
    # but at least one of them should.
    if compaction_algorithm == CompactionAlgorithm.TIERED:
        found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
        if not found_allowed_error:
            raise Exception("None of the allowed_errors occurred in the log")