mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 18:10:37 +00:00
Merge remote-tracking branch 'origin/main' into problame/benchmarking/pr/python-perftest
Conflicts: pageserver/src/tenant.rs, test_runner/fixtures/neon_fixtures.py, test_runner/fixtures/overlayfs.py, test_runner/regress/test_tenant_detach.py. In neon_fixtures.py, retain some of our version of things; I have something in git stash that won't apply otherwise.
This commit is contained in:
@@ -41,6 +41,7 @@ from psycopg2.extensions import make_dsn, parse_dsn
|
||||
from typing_extensions import Literal
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures import overlayfs
|
||||
from fixtures.broker import NeonBroker
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.allowed_errors import (
|
||||
@@ -553,7 +554,7 @@ class NeonEnvBuilder:
|
||||
tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
|
||||
|
||||
log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
|
||||
if not self.test_overlay_dir:
|
||||
if self.test_overlay_dir is None:
|
||||
shutil.copytree(tenants_from_dir, tenants_to_dir)
|
||||
else:
|
||||
self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)
|
||||
@@ -565,13 +566,16 @@ class NeonEnvBuilder:
|
||||
shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid"))
|
||||
|
||||
shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
|
||||
if not self.test_overlay_dir:
|
||||
if self.test_overlay_dir is None:
|
||||
shutil.copytree(
|
||||
repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
|
||||
)
|
||||
else:
|
||||
self.overlay_mount("local_fs_remote_storage",
|
||||
repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage")
|
||||
self.overlay_mount(
|
||||
"local_fs_remote_storage",
|
||||
repo_dir / "local_fs_remote_storage",
|
||||
self.repo_dir / "local_fs_remote_storage",
|
||||
)
|
||||
|
||||
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
|
||||
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
|
||||
@@ -1909,18 +1913,24 @@ class NeonPageserver(PgProtocol):
|
||||
return None
|
||||
|
||||
def tenant_attach(
|
||||
self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
config: None | Dict[str, Any] = None,
|
||||
config_null: bool = False,
|
||||
generation: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
Tenant attachment passes through here to acquire a generation number before proceeding
|
||||
to call into the pageserver HTTP client.
|
||||
"""
|
||||
client = self.http_client()
|
||||
if generation is None:
|
||||
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
return client.tenant_attach(
|
||||
tenant_id,
|
||||
config,
|
||||
config_null,
|
||||
generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id),
|
||||
generation=generation,
|
||||
)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId):
|
||||
@@ -3276,9 +3286,9 @@ class S3Scrubber:
|
||||
|
||||
|
||||
def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path:
|
||||
"""Compute the working directory for an individual test."""
|
||||
"""Compute the path to a working directory for an individual test."""
|
||||
test_name = request.node.name
|
||||
test_dir = top_output_dir / (prefix+test_name.replace("/", "-"))
|
||||
test_dir = top_output_dir / f"{prefix}{test_name.replace('/', '-')}"
|
||||
|
||||
# We rerun flaky tests multiple times, use a separate directory for each run.
|
||||
if (suffix := getattr(request.node, "execution_count", None)) is not None:
|
||||
@@ -3289,10 +3299,19 @@ def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) ->
|
||||
assert isinstance(test_dir, Path)
|
||||
return test_dir
|
||||
|
||||
|
||||
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
"""
|
||||
The working directory for a test.
|
||||
"""
|
||||
return _get_test_dir(request, top_output_dir, "")
|
||||
|
||||
|
||||
def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
"""
|
||||
Directory that contains `upperdir` and `workdir` for overlayfs mounts
|
||||
that a test creates. See `NeonEnvBuilder.overlay_mount`.
|
||||
"""
|
||||
return _get_test_dir(request, top_output_dir, "overlay-")
|
||||
|
||||
def get_test_snapshot_dir_path(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
@@ -3325,8 +3344,12 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
|
||||
# scope. So it uses the get_test_output_dir() function to get the path, and
|
||||
# this fixture ensures that the directory exists. That works because
|
||||
# 'autouse' fixtures are run before other fixtures.
|
||||
#
|
||||
# NB: we request the overlay dir fixture so the fixture does its cleanups
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def test_output_dir(request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path) -> Iterator[Path]:
|
||||
def test_output_dir(
|
||||
request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path
|
||||
) -> Iterator[Path]:
|
||||
"""Create the working directory for an individual test."""
|
||||
|
||||
_ = test_overlay_dir # request it so it can do cleanups
|
||||
@@ -3379,6 +3402,8 @@ def test_snapshot_dir(request: FixtureRequest, top_output_dir: Path, test_overla
|
||||
log.info(f"test_snapshot_dir is {snapshot_dir}")
|
||||
return SnapshotDir(snapshot_dir)
|
||||
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
|
||||
"""Create the overlay state directory for an individual test."""
|
||||
|
||||
@@ -1,9 +1,14 @@
|
||||
from typing import Iterator
|
||||
import psutil
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
def iter_mounts_beneath(topdir: Path) -> Iterator[Path]:
|
||||
"""
|
||||
Iterate over the overlayfs mounts beneath the specified `topdir`.
|
||||
The `topdir` itself isn't considered.
|
||||
"""
|
||||
for part in psutil.disk_partitions(all=True):
|
||||
if part.fstype == "overlay":
|
||||
mountpoint = Path(part.mountpoint)
|
||||
|
||||
@@ -144,8 +144,11 @@ def test_remote_storage_backup_and_restore(
|
||||
# Introduce failpoint in list remote timelines code path to make tenant_attach fail.
|
||||
# This is before the failures injected by test_remote_failures, so it's a permanent error.
|
||||
pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return"))
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*attach failed.*: storage-sync-list-remote-timelines",
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*attach failed.*: storage-sync-list-remote-timelines",
|
||||
".*Tenant state is Broken: storage-sync-list-remote-timelines.*",
|
||||
]
|
||||
)
|
||||
# Attach it. This HTTP request will succeed and launch a
|
||||
# background task to load the tenant. In that background task,
|
||||
@@ -159,9 +162,13 @@ def test_remote_storage_backup_and_restore(
|
||||
"data": {"reason": "storage-sync-list-remote-timelines"},
|
||||
}
|
||||
|
||||
# Ensure that even though the tenant is broken, we can't attach it again.
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
# Ensure that even though the tenant is broken, retrying the attachment fails
|
||||
with pytest.raises(Exception, match="Tenant state is Broken"):
|
||||
# Use same generation as in previous attempt
|
||||
gen_state = env.attachment_service.inspect(tenant_id)
|
||||
assert gen_state is not None
|
||||
generation = gen_state[0]
|
||||
env.pageserver.tenant_attach(tenant_id, generation=generation)
|
||||
|
||||
# Restart again, this implicitly clears the failpoint.
|
||||
# test_remote_failures=1 remains active, though, as it's in the pageserver config.
|
||||
@@ -176,10 +183,8 @@ def test_remote_storage_backup_and_restore(
|
||||
), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
|
||||
env.pageserver.start()
|
||||
|
||||
# Ensure that the pageserver remembers that the tenant was attaching, by
|
||||
# trying to attach it again. It should fail.
|
||||
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
# The attach should have got far enough that it recovers on restart (i.e. tenant's
|
||||
# config was written to local storage).
|
||||
log.info("waiting for tenant to become active. this should be quick with on-demand download")
|
||||
|
||||
wait_until_tenant_active(
|
||||
|
||||
@@ -627,7 +627,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
|
||||
|
||||
# Tests that attach never works on a tenant, ignored or not, as long as it's not absent locally
|
||||
# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
|
||||
def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
|
||||
def test_load_negatives(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
@@ -644,25 +644,16 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
|
||||
):
|
||||
env.pageserver.tenant_load(tenant_id)
|
||||
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException,
|
||||
match=f"tenant {tenant_id} already exists, state: Active",
|
||||
):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
|
||||
pageserver_http.tenant_ignore(tenant_id)
|
||||
|
||||
env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException,
|
||||
match="tenant directory already exists",
|
||||
):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
|
||||
|
||||
def test_ignore_while_attaching(
|
||||
def test_detach_while_activating(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Test cancellation behavior for tenants that are stuck somewhere between
|
||||
being attached and reaching Active state.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -684,39 +675,28 @@ def test_ignore_while_attaching(
|
||||
data_secret = "very secret secret"
|
||||
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
|
||||
|
||||
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
tenants_before_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
|
||||
# Detach it
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# And re-attach, but stop attach task_mgr task from completing
|
||||
pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(5000)")])
|
||||
pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(600000)")])
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
# Run ignore on the task, thereby cancelling the attach.
|
||||
# XXX This should take priority over attach, i.e., it should cancel the attach task.
|
||||
# But neither the failpoint, nor the proper remote_timeline_client download functions,
|
||||
# are sensitive to task_mgr::shutdown.
|
||||
# This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
|
||||
# So, for now, effectively, this ignore here will block until attach task completes.
|
||||
pageserver_http.tenant_ignore(tenant_id)
|
||||
|
||||
# Cannot attach it due to some local files existing
|
||||
env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException,
|
||||
match="tenant directory already exists",
|
||||
):
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
# The tenant is in the Activating state. This should not block us from
|
||||
# shutting it down and detaching it.
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
|
||||
assert len(tenants_after_ignore) + 1 == len(
|
||||
tenants_before_ignore
|
||||
tenants_after_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
assert tenant_id not in tenants_after_detach, "Detached tenant should be missing"
|
||||
assert len(tenants_after_detach) + 1 == len(
|
||||
tenants_before_detach
|
||||
), "Only ignored tenant should be missing"
|
||||
|
||||
# Calling load will bring the tenant back online
|
||||
# Subsequently attaching it again should still work
|
||||
pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")])
|
||||
env.pageserver.tenant_load(tenant_id)
|
||||
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
|
||||
endpoint.stop()
|
||||
|
||||
@@ -29,18 +29,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
|
||||
initial_tenants = sorted(
|
||||
map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
|
||||
)
|
||||
initial_tenant_dirs = [d for d in tenants_dir.iterdir()]
|
||||
[d for d in tenants_dir.iterdir()]
|
||||
|
||||
neon_simple_env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Failed to create directory structure for tenant .*, cleaning tmp data.*",
|
||||
".*Failed to fsync removed temporary tenant directory .*",
|
||||
]
|
||||
)
|
||||
neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
|
||||
|
||||
pageserver_http = neon_simple_env.pageserver.http_client()
|
||||
pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
|
||||
with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"):
|
||||
pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))
|
||||
with pytest.raises(Exception, match="tenant-config-before-write"):
|
||||
_ = neon_simple_env.neon_cli.create_tenant()
|
||||
|
||||
new_tenants = sorted(
|
||||
@@ -48,10 +43,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
|
||||
)
|
||||
assert initial_tenants == new_tenants, "should not create new tenants"
|
||||
|
||||
new_tenant_dirs = [d for d in tenants_dir.iterdir()]
|
||||
assert (
|
||||
new_tenant_dirs == initial_tenant_dirs
|
||||
), "pageserver should clean its temp tenant dirs on tenant creation failure"
|
||||
# Any files left behind on disk during failed creation do not prevent
|
||||
# a retry from succeeding.
|
||||
pageserver_http.configure_failpoints(("tenant-config-before-write", "off"))
|
||||
neon_simple_env.neon_cli.create_tenant()
|
||||
|
||||
|
||||
def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
Reference in New Issue
Block a user