mirror of https://github.com/neondatabase/neon.git, synced 2026-01-16 01:42:55 +00:00
fixes https://github.com/neondatabase/neon/issues/7790

(duplicating most of the issue description here for posterity)

# Background

From the time before always-authoritative `index_part.json`, we had to handle duplicate layers. See the RFC for an illustration of how duplicate layers could happen:

a8e6d259cb/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md (L41-L50)

As of #5198, we should not be exposed to that problem anymore.

# Problem 1

We still have

1. [code in Pageserver](82960b2175/pageserver/src/tenant/timeline.rs (L4502-L4521)) that handles duplicate layers
2. [tests in the test suite](d9dcbffac3/test_runner/regress/test_duplicate_layers.py (L15)) that demonstrate the problem using a failpoint

However, the test in the test suite doesn't use the failpoint to induce a crash that could legitimately happen in production. What it does instead is return early with an `Ok()`, so that the code in Pageserver that handles duplicate layers (item 1) actually gets exercised. That "return early" would be a bug in the routine if it happened in production.

So, the tests in the test suite exist for their own sake, but they don't actually regress-test any production behavior.

# Problem 2

Further, if production code _did_ create a duplicate layer (nowadays it doesn't!), the code in Pageserver that handles the condition (item 1 above) is too little and too late:

* The code handles it by discarding the newer `struct Layer`; that's good.
* However, on disk, we have already overwritten the old layer file with the new one.
* The fact that we do it atomically doesn't matter, because ...
* ... if the new layer file is not bit-identical, we have a cache coherency problem:
  * PS PageCache block cache: caches the old bit pattern
  * blob_io offsets stored in variables are based on the pre-overwrite bit pattern / offsets
  * => reading from the new file based on these offsets might yield different data than before

# Solution

- Remove the test suite code pertaining to Problem 1.
- Move & rename the test suite code that actually tests the RFC-27 crash-consistent layer map.
- Remove the Pageserver code that handles duplicate layers too late (Problem 1).
- Use `RENAME_NOREPLACE` so that the rename in `.finish()` cannot overwrite an existing file, and bail with an error if that happens (Problem 2).
  - Bailing here prevents the caller from even trying to insert into the layer map, because they never get a `struct Layer` at hand.
- Add `abort`s in the place where we hold the layer map lock and check for duplicates (Problem 2).
  - Note again: we can't reach that code, because we bail from `.finish()` much earlier.
- Share the logic to clean up after a failed `.finish()` between image layers and delta layers (drive-by cleanup).
  - This exposed that the test `image_layer_rewrite` was overwriting layer files in place. Fix the test.

# Future Work

This PR adds a new failure scenario that was previously "papered over" by the overwriting of layers:

1. Start a compaction that will produce 3 layers: A, B, C.
2. Layer A is `finish()`ed successfully.
3. Layer B fails mid-way at some `put_value()`.
4. Compaction bails out and sleeps 20s.
5. Some disk space gets freed in the meantime.
6. Compaction wakes from sleep, another iteration starts, and it attempts to write Layer A again. But the `.finish()` **fails because A already exists on disk**.

The failure in the last step is new with this PR, and it **causes the compaction to get stuck**. Before, compaction would silently overwrite the file and "successfully" complete the second iteration. The mitigation for this is to `/reset` the tenant.
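To make the overwrite-prevention described under "Solution" concrete: publishing a finished layer file must fail, rather than silently replace, when a file with that name already exists. Below is a minimal, hypothetical Python sketch of that "fail instead of clobber" semantics. It is an illustration only: the real change is in the Rust pageserver and uses the kernel's `RENAME_NOREPLACE` rename flag, whereas this sketch emulates the behavior with a hard link, and the function name is made up.

```python
import os


def publish_layer_no_replace(tmp_path: str, final_path: str) -> None:
    """Move tmp_path to final_path, refusing to overwrite an existing file.

    Hypothetical illustration: emulates RENAME_NOREPLACE semantics by
    hard-linking the temp file to its final name (which fails if the name
    already exists) and then removing the temporary name.
    """
    try:
        os.link(tmp_path, final_path)  # raises FileExistsError instead of clobbering
    except FileExistsError:
        # Analogous to `.finish()` bailing out: the caller never gets a layer
        # handle, so it cannot insert a duplicate into the layer map.
        raise RuntimeError(f"layer file already exists: {final_path}")
    os.unlink(tmp_path)
```

With a plain replacing rename, the second compaction attempt in the "Future Work" scenario would silently succeed; with the no-replace variant it surfaces an error, which is exactly the new failure mode the PR accepts and mitigates via `/reset`.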
221 lines | 8.3 KiB | Python
import random
from contextlib import closing
from typing import Optional

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import s3_storage
from fixtures.utils import wait_until


# Test restarting page server, while safekeeper and compute node keep
# running.
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()

    env = neon_env_builder.init_start()

    endpoint = env.endpoints.create_start("main")
    pageserver_http = env.pageserver.http_client()

    assert (
        pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"})
        == 1
    )

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    cur.execute("CREATE TABLE foo (t text)")
    cur.execute(
        """
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
            FROM generate_series(1, 100000) g
        """
    )

    # Verify that the table is larger than shared_buffers
    cur.execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
    row = cur.fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

    # Stop the pageserver gracefully and restart it.
    env.pageserver.stop()
    env.pageserver.start()

    # We reloaded our tenant
    assert (
        pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"})
        == 1
    )

    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

    # Restart the server again, but delay the loading of tenants, and test what the
    # pageserver does if a compute node connects and sends a request for the tenant
    # while it's still in Loading state. (It waits for the loading to finish, and then
    # processes the request.)
    tenant_load_delay_ms = 5000
    env.pageserver.stop()
    env.pageserver.start(
        extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"}
    )

    # Check that it's in Attaching state
    client = env.pageserver.http_client()
    tenant_status = client.tenant_status(env.initial_tenant)
    log.info("Tenant status : %s", tenant_status)
    assert tenant_status["state"]["slug"] == "Attaching"

    # Try to read. This waits until the loading finishes, and then return normally.
    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

    # Wait for metrics to indicate startup complete, so that we can know all
    # startup phases will be reflected in the subsequent checks
    def assert_complete():
        for sample in pageserver_http.get_metrics().query_all(
            "pageserver_startup_duration_seconds"
        ):
            labels = dict(sample.labels)
            log.info(f"metric {labels['phase']}={sample.value}")
            if labels["phase"] == "complete" and sample.value > 0:
                return

        raise AssertionError("No 'complete' metric yet")

    wait_until(30, 1.0, assert_complete)

    # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
    expectations = [
        (
            "initial",
            lambda t, p: True,
        ),  # make no assumptions about the initial time point, it could be 0 in theory
        # Remote phase of initial_tenant_load should happen before overall phase is complete
        ("initial_tenant_load_remote", lambda t, p: t >= 0.0 and t >= p),
        # Initial tenant load should reflect the delay we injected
        ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p),
        # Subsequent steps should occur in expected order
        ("background_jobs_can_start", lambda t, p: t > 0 and t >= p),
        ("complete", lambda t, p: t > 0 and t >= p),
    ]

    # Accumulate the runtime of each startup phase
    values = {}
    metrics = pageserver_http.get_metrics()
    prev_value = None
    for sample in metrics.query_all("pageserver_startup_duration_seconds"):
        phase = sample.labels["phase"]
        log.info(f"metric {phase}={sample.value}")
        assert phase in [e[0] for e in expectations], f"Unexpected phase {phase}"
        values[phase] = sample

    # Apply expectations to the metrics retrieved
    for phase, expectation in expectations:
        assert phase in values, f"No data for phase {phase}"
        sample = values[phase]
        assert expectation(
            sample.value, prev_value
        ), f"Unexpected value for {phase}: {sample.value}"
        prev_value = sample.value

    # Startup is complete, this metric should exist but be zero
    assert metrics.query_one("pageserver_startup_is_loading").value == 0

    # This histogram should have been populated, although we aren't specific about exactly
    # which bucket values: just nonzero
    assert any(
        bucket.value > 0
        for bucket in metrics.query_all("pageserver_tenant_activation_seconds_bucket")
    )


# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.
@pytest.mark.timeout(540)
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pageserver_chaos(
    neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
):
    if build_type == "debug":
        pytest.skip("times out in debug builds")

    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count

    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)

    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            "checkpoint_distance": "5000000",
        }
    )
    env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant)
    endpoint = env.endpoints.create_start("test_pageserver_chaos", tenant_id=tenant)

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE foo (id int, t text, updates int)")
            cur.execute("CREATE INDEX ON foo (id)")
            cur.execute(
                """
                INSERT INTO foo
                    SELECT g, 'long string to consume some space' || g, 0
                    FROM generate_series(1, 100000) g
                """
            )

            # Verify that the table is larger than shared_buffers
            cur.execute(
                """
                select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
                from pg_settings where name = 'shared_buffers'
                """
            )
            row = cur.fetchone()
            assert row is not None
            log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
            assert int(row[0]) < int(row[1])

    # We run "random" kills using a fixed seed, to improve reproducibility if a test
    # failure is related to a particular order of operations.
    seed = 0xDEADBEEF
    rng = random.Random(seed)

    # Update the whole table, then immediately kill and restart the pageserver
    for i in range(1, 15):
        endpoint.safe_psql("UPDATE foo set updates = updates + 1")

        # This kills the pageserver immediately, to simulate a crash
        to_kill = rng.choice(env.pageservers)
        to_kill.stop(immediate=True)
        to_kill.start()

        # Check that all the updates are visible
        num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
        assert num_updates == i * 100000