mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-29 19:10:38 +00:00
Saw a failure like this, from 'test_tenants_attached_after_download' and 'test_tenant_redownloads_truncated_file_on_startup': > test_runner/fixtures/neon_fixtures.py:1064: in verbose_error > res.raise_for_status() > /github/home/.cache/pypoetry/virtualenvs/neon-_pxWMzVK-py3.9/lib/python3.9/site-packages/requests/models.py:1021: in raise_for_status > raise HTTPError(http_error_msg, response=self) > E requests.exceptions.HTTPError: 404 Client Error: Not Found for url: http://localhost:18150/v1/tenant/2334c9c113a82b5dd1651a0a23c53448/timeline > > The above exception was the direct cause of the following exception: > test_runner/regress/test_tenants_with_remote_storage.py:185: in test_tenants_attached_after_download > restored_timelines = client.timeline_list(tenant_id) > test_runner/fixtures/neon_fixtures.py:1148: in timeline_list > self.verbose_error(res) > test_runner/fixtures/neon_fixtures.py:1070: in verbose_error > raise PageserverApiException(msg) from e > E fixtures.neon_fixtures.PageserverApiException: NotFound: Tenant 2334c9c113a82b5dd1651a0a23c53448 is not active. Current state: Loading These tests start the pageserver, wait until assert_no_in_progress_downloads_for_tenant says that has_downloads_in_progress is false, and then call timeline_list on the tenant. But has_downloads_in_progress was only returned as true when the tenant was being attached, not when it was being loaded at pageserver startup. Change the tenant_status API endpoint (/v1/tenant/:tenant_id) so that it returns has_downloads_in_progress=true also for tenants that are still in Loading state.
185 lines
7.5 KiB
Python
185 lines
7.5 KiB
Python
import concurrent.futures
|
|
import os
|
|
from typing import List, Tuple
|
|
|
|
import pytest
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
|
|
from fixtures.types import TenantId, TimelineId
|
|
|
|
|
|
# Test restarting the pageserver after timeline files were corrupted on disk:
# the intact tenant must keep working, while compute startup fails for the damaged ones.
|
|
def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    """Check how the pageserver copes with damaged timelines after a restart.

    Four tenants are created and populated, then the pageserver is stopped and
    three of the timelines are damaged on disk (garbage metadata, removed layer
    files, garbage layer files).  Once the pageserver is started again, the
    untouched tenant must still answer queries, while starting compute on each
    damaged tenant must fail with the expected error.
    """
    env = neon_env_builder.init_start()

    # Error patterns this test provokes on purpose.
    # NOTE(review): presumably the fixture tolerates log lines matching these - confirm.
    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to load delta layer.*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
        ]
    )

    tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []

    for _ in range(4):
        tenant_id, timeline_id = env.neon_cli.create_tenant()

        pg = env.postgres.create_start("main", tenant_id=tenant_id)
        with pg.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
        pg.stop()
        tenant_timelines.append((tenant_id, timeline_id, pg))

    # Stop the pageserver
    env.pageserver.stop()

    # Leave the first timeline alone, but corrupt the others in different ways
    (tenant0, timeline0, pg0) = tenant_timelines[0]
    log.info(f"Timeline {tenant0}/{timeline0} is left intact")

    (tenant1, timeline1, pg1) = tenant_timelines[1]
    metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
    # Use a context manager so the file is closed even if the write fails.
    with open(metadata_path, "w") as f:
        f.write("overwritten with garbage!")
    log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")

    (tenant2, timeline2, pg2) = tenant_timelines[2]
    timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/"
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            # Looks like a layer file. Remove it
            os.remove(os.path.join(timeline_path, filename))
    log.info(
        f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
    )

    (tenant3, timeline3, pg3) = tenant_timelines[3]
    timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/"
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            # Looks like a layer file. Corrupt it
            with open(os.path.join(timeline_path, filename), "w") as f:
                f.write("overwritten with garbage!")
    log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")

    env.pageserver.start()

    # Tenant 0 should still work
    pg0.start()
    assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

    # But all others are broken

    # First timeline would not get loaded into pageserver due to corrupt metadata file
    with pytest.raises(
        Exception, match=f"Tenant {tenant1} will not become active. Current state: Broken"
    ) as err:
        pg1.start()
    log.info(
        f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
    )

    # Second timeline has no ancestors, only the metadata file and no layer files
    # This will fail with an error like "extracting base backup failed" and cause
    # "could not find data for key"
    with pytest.raises(Exception, match=".*could not find data for key.*") as err:
        pg2.start()
    log.info(f"As expected, compute startup failed eagerly for timeline with missing layers: {err}")

    # Third timeline will also fail during basebackup, because the layer file is corrupt.
    # (We don't check layer file contents on startup, when loading the timeline)
    with pytest.raises(Exception, match="Failed to load delta layer") as err:
        pg3.start()
    log.info(
        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
    )
|
|
|
|
|
|
def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
    """Create four timelines for a single tenant from four threads at once.

    Each creation is submitted to a thread pool and then awaited; ``result()``
    re-raises any exception from a worker thread, so a failed creation fails
    the test.
    """
    env = neon_simple_env
    tenant_id, _ = env.neon_cli.create_tenant()

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        pending = []
        for idx in range(4):
            branch_name = f"test-create-multiple-timelines-{idx}"
            pending.append(pool.submit(env.neon_cli.create_timeline, branch_name, tenant_id))

        # Wait for every creation; an error in any thread surfaces here.
        for task in pending:
            task.result()
|
|
|
|
|
|
def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
    """Check that a timeline creation interrupted before its checkpoint leaves no trace.

    A failpoint makes timeline creation fail, then the pageserver is restarted.
    Afterwards the tenant's timeline list and the contents of its ``timelines``
    directory must be the same as before the failed attempt.
    """
    env = neon_simple_env
    pageserver_http = env.pageserver.http_client()

    # Error patterns this test provokes on purpose.
    # NOTE(review): presumably the fixture tolerates log lines matching these - confirm.
    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
        ]
    )

    tenant_id, _ = env.neon_cli.create_tenant()

    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # iterdir() yields entries in arbitrary order; sort so the before/after
    # comparison depends only on which entries exist.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
        env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)

    # Restart the page server
    env.neon_cli.pageserver_stop(immediate=True)
    env.neon_cli.pageserver_start()

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"
|
|
|
|
|
|
def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
    """Check that a timeline creation failing right after its uninit mark leaves no trace.

    A failpoint makes timeline creation fail.  Afterwards the tenant's timeline
    list and the contents of its ``timelines`` directory must be the same as
    before the failed attempt.
    """
    env = neon_simple_env
    pageserver_http = env.pageserver.http_client()

    tenant_id, _ = env.neon_cli.create_tenant()

    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # iterdir() yields entries in arbitrary order; sort so the before/after
    # comparison depends only on which entries exist.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Introduce failpoint when creating a new timeline uninit mark, before any other files were created
    pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
    with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
        env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    # "New" timeline is not present in the list, allowing pageserver to retry the same request
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    # NOTE(review): the message below mentions a restart, but this test never
    # restarts the pageserver - consider rewording it.
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"
|