Mirror of https://github.com/neondatabase/neon.git
## Problem

We are currently supporting two read paths. No bueno.

## Summary of changes

High level: use the vectored read path to serve get page requests, gated by the `get_impl` config.

Low level:

1. Add a pageserver config, `get_impl`, to specify which read path to use when serving get page requests.
2. Fix base cached image handling for the vectored read path. This was subtly broken: previously we would not mark keys that went past their cached LSN as complete. This is a self-standing change which could be its own PR, but I've included it here because writing separate tests for it is tricky.
3. Fork get page to use either the legacy or the vectored implementation.
4. Validate the vectored read path against the legacy implementation when serving get page requests. Controlled by the `validate_vectored_get` pageserver config.
5. Use the vectored read path to serve get page requests in tests (with validation); a sketch of the test-side wiring follows the note below.

## Note

Since the vectored read path does not go through the page cache to read buffers, this change also amounts to a removal of the buffer page cache. The materialized page cache is still used.
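To make the test-side wiring concrete, here is a minimal sketch (not code from this PR) of how a test could exercise both read paths with validation enabled. It assumes the fixtures expose a `pageserver_get_impl` attribute (it is consulted in `test_local_corruption` below) and that extra pageserver settings can be injected as a TOML override; the attribute name `pageserver_config_override`, the test name, and the exact TOML syntax for `get_impl` and `validate_vectored_get` are assumptions made for illustration.

```python
import pytest

from fixtures.neon_fixtures import NeonEnvBuilder


@pytest.mark.parametrize("get_impl", ["legacy", "vectored"])
def test_get_page_read_path_sketch(neon_env_builder: NeonEnvBuilder, get_impl: str):
    # Assumption: the builder records the chosen implementation so tests can
    # branch on it, mirroring the `pageserver_get_impl` attribute used below.
    neon_env_builder.pageserver_get_impl = get_impl

    # Assumption: extra pageserver settings can be passed as TOML overrides.
    # `validate_vectored_get` cross-checks vectored results against the legacy
    # read path; it only has an effect when the vectored implementation is used.
    neon_env_builder.pageserver_config_override = (
        f"get_impl='{get_impl}'\nvalidate_vectored_get=true"
    )

    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
    with endpoint.cursor() as cur:
        # Any read-heavy query drives get page requests through the chosen path.
        cur.execute("CREATE TABLE t AS SELECT generate_series(1, 1000) AS x")
        cur.execute("SELECT count(*) FROM t")
        assert cur.fetchone()[0] == 1000
```

The value of running tests this way is that, with validation enabled, any divergence between the vectored and legacy read paths surfaces as a test failure rather than as a silent correctness bug.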
238 lines · 9.7 KiB · Python
import concurrent.futures
import os
from typing import List, Tuple

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    Endpoint,
    NeonEnv,
    NeonEnvBuilder,
    wait_for_last_flush_lsn,
)
from fixtures.pg_version import PgVersion
from fixtures.types import TenantId, TimelineId


# Test restarting page server, while safekeeper and compute node keep
# running.
def test_local_corruption(neon_env_builder: NeonEnvBuilder):
    if neon_env_builder.pageserver_get_impl == "vectored":
        reconstruct_function_name = "get_values_reconstruct_data"
    else:
        reconstruct_function_name = "get_value_reconstruct_data"

    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(
        [
            f".*{reconstruct_function_name} for layer .*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
            ".*load failed.*load local timeline.*",
            ".*layer loading failed permanently: load layer: .*",
        ]
    )

    tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []

    for _ in range(3):
        tenant_id, timeline_id = env.neon_cli.create_tenant()

        endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
        with endpoint.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
        endpoint.stop()
        tenant_timelines.append((tenant_id, timeline_id, endpoint))

    # Stop the pageserver -- this must not be an immediate stop, or else we need to wait for uploads first
    env.pageserver.stop()

    # Leave the first timeline alone, but corrupt the others in different ways
    (tenant0, timeline0, pg0) = tenant_timelines[0]
    log.info(f"Timeline {tenant0}/{timeline0} is left intact")

    (tenant1, timeline1, pg1) = tenant_timelines[1]
    metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
    with open(metadata_path, "w") as f:
        f.write("overwritten with garbage!")
    log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")

    (tenant2, timeline2, pg2) = tenant_timelines[2]
    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            # Looks like a layer file. Corrupt it
            p = f"{timeline_path}/{filename}"
            size = os.path.getsize(p)
            with open(p, "wb") as f:
                f.truncate(0)
                f.truncate(size)
    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")

    env.pageserver.start()

    # Un-damaged tenant works
    pg0.start()
    assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

    # Tenant with corrupt local metadata works: remote storage is authoritative for metadata
    pg1.start()
    assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

    # The third timeline will fail during basebackup, because its local layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
    #
    # This will change when we implement checksums for layers
    with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err:
        pg2.start()
    log.info(
        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
    )


def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
    env = neon_simple_env

    tenant_id, _ = env.neon_cli.create_tenant()

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(
                env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id
            )
            for i in range(4)
        ]
        for future in futures:
            future.result()


def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
        ]
    )

    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
        _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())

    # Restart the page server
    env.pageserver.restart(immediate=True)

    # Creating the timeline didn't finish. The other timelines on the tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = [d for d in timelines_dir.iterdir()]
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"


# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
|
|
@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
|
|
def test_timeline_init_break_before_checkpoint_recreate(
|
|
neon_env_builder: NeonEnvBuilder, exit_or_return: str
|
|
):
|
|
env = neon_env_builder.init_configs()
|
|
env.start()
|
|
pageserver_http = env.pageserver.http_client()
|
|
|
|
env.pageserver.allowed_errors.extend(
|
|
[
|
|
".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
|
|
".*Timeline got dropped without initializing, cleaning its files.*",
|
|
".*Failed to load index_part from remote storage, failed creation?.*",
|
|
]
|
|
)
|
|
|
|
env.neon_cli.create_tenant(env.initial_tenant)
|
|
tenant_id = env.initial_tenant
|
|
|
|
timelines_dir = env.pageserver.timeline_dir(tenant_id)
|
|
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
|
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
|
|
|
|
# Some fixed timeline ID (like control plane does)
|
|
timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")
|
|
|
|
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
|
|
failpoint = "before-checkpoint-new-timeline"
|
|
pattern = failpoint
|
|
if exit_or_return == "exit":
|
|
# in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
|
|
pattern = "Connection aborted."
|
|
|
|
pageserver_http.configure_failpoints((failpoint, exit_or_return))
|
|
with pytest.raises(Exception, match=pattern):
|
|
_ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)
|
|
|
|
# Restart the page server (with the failpoint disabled)
|
|
env.pageserver.restart(immediate=True)
|
|
|
|
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
|
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
|
assert (
|
|
new_tenant_timelines == old_tenant_timelines
|
|
), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
|
|
|
|
timeline_dirs = [d for d in timelines_dir.iterdir()]
|
|
assert (
|
|
timeline_dirs == initial_timeline_dirs
|
|
), "pageserver should clean its temp timeline files on timeline creation failure"
|
|
|
|
# creating the branch should have worked now
|
|
new_timeline_id = TimelineId(
|
|
pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
|
|
)
|
|
|
|
assert timeline_id == new_timeline_id
|
|
|
|
|
|
def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

    # Introduce failpoint when creating a new timeline, right after creating its directory
    pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return"))
    with pytest.raises(Exception, match="after-timeline-dir-creation"):
        _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())

    # Creating the timeline didn't finish. The other timelines on the tenant should still be present and work normally.
    # The "new" timeline is not present in the list, allowing the pageserver to retry the same request
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = [d for d in timelines_dir.iterdir()]
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"