mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 01:50:38 +00:00
pageserver: add InProgress tenant map state, use a sync lock for the map (#5367)
## Problem Follows on from #5299 - We didn't have a generic way to protect a tenant undergoing changes: `Tenant` had states, but for our arbitrary transitions between secondary/attached, we need a general way to say "reserve this tenant ID, and don't allow any other ops on it, but don't try and report it as being in any particular state". - The TenantsMap structure was behind an async RwLock, but it was never correct to hold it across await points: that would block any other changes for all tenants. ## Summary of changes - Add the `TenantSlot::InProgress` value. This means: - Incoming administrative operations on the tenant should retry later - Anything trying to read the live state of the tenant (e.g. a page service reader) should retry later or block. - Store TenantsMap in `std::sync::RwLock` - Provide an extended `get_active_tenant_with_timeout` for page_service to use, which will wait on InProgress slots as well as non-active tenants. Closes: https://github.com/neondatabase/neon/issues/5378 --------- Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
@@ -626,6 +626,8 @@ class NeonEnvBuilder:
|
||||
sk.stop(immediate=True)
|
||||
|
||||
for pageserver in self.env.pageservers:
|
||||
pageserver.assert_no_metric_errors()
|
||||
|
||||
pageserver.stop(immediate=True)
|
||||
|
||||
if self.env.attachment_service is not None:
|
||||
@@ -1784,6 +1786,21 @@ class NeonPageserver(PgProtocol):
|
||||
|
||||
assert not errors
|
||||
|
||||
def assert_no_metric_errors(self):
|
||||
"""
|
||||
Certain metrics should _always_ be zero: they track conditions that indicate a bug.
|
||||
"""
|
||||
if not self.running:
|
||||
log.info(f"Skipping metrics check on pageserver {self.id}, it is not running")
|
||||
return
|
||||
|
||||
for metric in [
|
||||
"pageserver_tenant_manager_unexpected_errors_total",
|
||||
"pageserver_deletion_queue_unexpected_errors_total",
|
||||
]:
|
||||
value = self.http_client().get_metric_value(metric)
|
||||
assert value == 0, f"Nonzero {metric} == {value}"
|
||||
|
||||
def log_contains(self, pattern: str) -> Optional[str]:
|
||||
"""Check that the pageserver log contains a line that matches the given regex"""
|
||||
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
|
||||
|
||||
@@ -134,6 +134,9 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
|
||||
env.neon_cli.pageserver_stop(env.pageserver.id)
|
||||
env.neon_cli.safekeeper_stop()
|
||||
|
||||
# Keep NeonEnv state up to date, it usually owns starting/stopping services
|
||||
env.pageserver.running = False
|
||||
|
||||
# Default start
|
||||
res = env.neon_cli.raw_cli(["start"])
|
||||
res.check_returncode()
|
||||
@@ -155,6 +158,10 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
|
||||
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
|
||||
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)
|
||||
|
||||
# Keep NeonEnv state up to date, it usually owns starting/stopping services
|
||||
env.pageservers[0].running = False
|
||||
env.pageservers[1].running = False
|
||||
|
||||
# Addressing a nonexistent ID throws
|
||||
with pytest.raises(RuntimeError):
|
||||
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)
|
||||
|
||||
@@ -20,6 +20,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -52,6 +54,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
# We reloaded our tenant
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
|
||||
|
||||
cur.execute("SELECT count(*) FROM foo")
|
||||
assert cur.fetchone() == (100000,)
|
||||
|
||||
|
||||
@@ -63,6 +63,9 @@ def test_tenant_delete_smoke(
|
||||
conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
)
|
||||
|
||||
# Default tenant and the one we created
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
|
||||
|
||||
# create two timelines one being the parent of another
|
||||
parent = None
|
||||
for timeline in ["first", "second"]:
|
||||
@@ -88,7 +91,9 @@ def test_tenant_delete_smoke(
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
|
||||
|
||||
tenant_path = env.pageserver.tenant_dir(tenant_id)
|
||||
assert not tenant_path.exists()
|
||||
@@ -104,6 +109,9 @@ def test_tenant_delete_smoke(
|
||||
),
|
||||
)
|
||||
|
||||
# Deletion updates the tenant count: the one default tenant remains
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
|
||||
|
||||
|
||||
class Check(enum.Enum):
|
||||
RETRY_WITHOUT_RESTART = enum.auto()
|
||||
|
||||
@@ -26,6 +26,16 @@ from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
from prometheus_client.samples import Sample
|
||||
|
||||
# In tests that overlap endpoint activity with tenant attach/detach, there are
|
||||
# a variety of warnings that the page service may emit when it cannot acquire
|
||||
# an active tenant to serve a request
|
||||
PERMIT_PAGE_SERVICE_ERRORS = [
|
||||
".*page_service.*Tenant .* not found",
|
||||
".*page_service.*Tenant .* is not active",
|
||||
".*page_service.*cancelled",
|
||||
".*page_service.*will not become active.*",
|
||||
]
|
||||
|
||||
|
||||
def do_gc_target(
|
||||
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
||||
@@ -60,12 +70,7 @@ def test_tenant_reattach(
|
||||
# create new nenant
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
with endpoint.cursor() as cur:
|
||||
@@ -235,10 +240,7 @@ def test_tenant_reattach_while_busy(
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
||||
|
||||
@@ -259,7 +261,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
# first check for non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
@@ -271,19 +273,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
assert excinfo.value.status_code == 404
|
||||
|
||||
# the error will be printed to the log too
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant *")
|
||||
|
||||
# create new nenant
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
|
||||
# assert tenant exists on disk
|
||||
assert env.pageserver.tenant_dir(tenant_id).exists()
|
||||
|
||||
@@ -345,12 +337,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
|
||||
# create a new tenant
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
# assert tenant exists on disk
|
||||
assert env.pageserver.tenant_dir(tenant_id).exists()
|
||||
@@ -401,12 +388,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
|
||||
# create a new tenant
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
# assert tenant exists on disk
|
||||
assert env.pageserver.tenant_dir(tenant_id).exists()
|
||||
@@ -453,12 +435,7 @@ def test_detach_while_attaching(
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||
# shared_buffers, otherwise the SELECT after restart will just return answer
|
||||
@@ -593,12 +570,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
data_id = 1
|
||||
data_secret = "very secret secret"
|
||||
@@ -649,12 +621,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
|
||||
with pytest.raises(
|
||||
@@ -693,12 +660,7 @@ def test_ignore_while_attaching(
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Attempts to connect from compute to pageserver while the tenant is
|
||||
# temporarily detached produces these errors in the pageserver log.
|
||||
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
data_id = 1
|
||||
data_secret = "very secret secret"
|
||||
|
||||
Reference in New Issue
Block a user