tests: update for tenant generations (#5449)

## Problem

Some existing tests are written in a way that's incompatible with tenant
generations.

## Summary of changes

Update all the tests that need updating. Mostly this means calling
through the NeonPageserver.tenant_attach helper to obtain a generation
number, instead of calling directly into the pageserver API. There are
also various more subtle cases.
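
As an illustration of the mechanical change (a sketch, not code from the diff below; `tenant_id` and the surrounding test are assumed):

```python
from fixtures.neon_fixtures import NeonEnv
from fixtures.types import TenantId


def attach_in_test(env: NeonEnv, tenant_id: TenantId):
    # Before: tests called the pageserver HTTP API directly, bypassing the
    # attachment service, so no generation number was ever issued:
    #   env.pageserver.http_client().tenant_attach(tenant_id)
    #
    # After: tests go through the NeonPageserver helper, which (when the
    # attachment service is enabled) issues a generation via attach_hook_issue
    # and forwards it to the pageserver HTTP API.
    env.pageserver.tenant_attach(tenant_id)
```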
John Spray
2023-12-07 12:27:16 +00:00
committed by GitHub
parent f9401fdd31
commit e89e41f8ba
27 changed files with 424 additions and 317 deletions

View File

@@ -201,6 +201,12 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
let valid = tenant_state.generation == req_tenant.gen;
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {})",
req_tenant.id,
req_tenant.gen,
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid,
@@ -250,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
tracing::info!(
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
attach_req.tenant_id,
tenant_state.generation,
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(

View File

@@ -323,6 +323,7 @@ impl TenantConfigRequest {
#[derive(Debug, Deserialize)]
pub struct TenantAttachRequest {
#[serde(default)]
pub config: TenantAttachConfig,
#[serde(default)]
pub generation: Option<u32>,
@@ -330,7 +331,7 @@ pub struct TenantAttachRequest {
/// Newtype to enforce deny_unknown_fields on TenantConfig for
/// its usage inside `TenantAttachRequest`.
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct TenantAttachConfig {
#[serde(flatten)]
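
For reference, the attach request body that tests end up sending now looks roughly like this; the field names (`config`, `generation`) and the URL path come from this commit, while the host, port and values are invented for illustration:

```python
import requests

tenant_id = "74ee8b079a0e437eb0afea7d26a07209"  # example id, reused from a test below
body = {
    "config": {"compaction_period": "1h"},  # flattened TenantAttachConfig fields
    "generation": 3,                        # issued by the attachment service
}
# Host and port are placeholders for a local pageserver.
requests.post(f"http://localhost:9898/v1/tenant/{tenant_id}/attach", json=body)
```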

View File

@@ -99,27 +99,35 @@ impl LocalFs {
};
// If we were given a directory, we may use it as our starting point.
// Otherwise, we must go up to the parent directory. This is because
// Otherwise, we must go up to the first ancestor dir that exists. This is because
// S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
match fs::metadata(full_path.clone()).await {
Ok(meta) => {
if !meta.is_dir() {
loop {
// Did we make it to the root?
if initial_dir.parent().is_none() {
anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
}
match fs::metadata(initial_dir.clone()).await {
Ok(meta) if meta.is_dir() => {
// We found a directory, break
break;
}
Ok(_meta) => {
// It's not a directory: strip back to the parent
initial_dir.pop();
}
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
}
// Note that Utf8PathBuf starts_with only considers full path segments, but
// object prefixes are arbitrary strings, so we need the strings for doing
// starts_with later.

View File

@@ -312,7 +312,18 @@ impl ListWriter {
for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
if attached_gen.previous() == tenant_list.generation {
info!(
seq=%s, tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
old_gen=?tenant_list.generation, new_gen=?attached_gen,
"Updating gen on recovered list");
tenant_list.generation = *attached_gen;
} else {
info!(
seq=%s, tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
old_gen=?tenant_list.generation, new_gen=?attached_gen,
"Encountered stale generation on recovered list");
}
}
}

View File

@@ -266,9 +266,7 @@ class NeonPageserverHttpClient(requests.Session):
def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
res = self.post(
f"http://{self.host}:{self.port}/v1/tenant",
json={
"new_tenant_id": new_tenant_id.hex,
},
json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
)
if res.status_code == 409:

View File

@@ -455,7 +455,7 @@ class NeonEnvBuilder:
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
self.enable_generations = False
self.enable_generations = True
self.scrub_on_exit = False
self.test_output_dir = test_output_dir
@@ -1571,6 +1571,20 @@ class NeonAttachmentService:
)
response.raise_for_status()
def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]:
response = requests.post(
f"{self.env.control_plane_api}/inspect",
json={"tenant_id": str(tenant_id)},
)
response.raise_for_status()
json = response.json()
log.info(f"Response: {json}")
if json["attachment"]:
# Explicit int() to make python type linter happy
return (int(json["attachment"][0]), int(json["attachment"][1]))
else:
return None
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -1769,13 +1783,10 @@ class NeonPageserver(PgProtocol):
Tenant attachment passes through here to acquire a generation number before proceeding
to call into the pageserver HTTP client.
"""
if self.env.attachment_service is not None:
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
else:
generation = None
client = self.http_client()
return client.tenant_attach(tenant_id, config, config_null, generation=generation)
return client.tenant_attach(
tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
)
def tenant_detach(self, tenant_id: TenantId):
if self.env.attachment_service is not None:
@@ -1784,6 +1795,34 @@ class NeonPageserver(PgProtocol):
client = self.http_client()
return client.tenant_detach(tenant_id)
def tenant_create(
self,
tenant_id: TenantId,
conf: Optional[Dict[str, Any]] = None,
auth_token: Optional[str] = None,
) -> TenantId:
client = self.http_client(auth_token=auth_token)
return client.tenant_create(
tenant_id, conf, generation=self.maybe_get_generation(tenant_id)
)
def tenant_load(self, tenant_id: TenantId):
client = self.http_client()
return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
def maybe_get_generation(self, tenant_id: TenantId):
"""
For tests that would like to use an HTTP client directly instead of using
the `tenant_attach` and `tenant_create` helpers here: issue a generation
number for a tenant.
Returns None if the attachment service is not enabled (legacy mode)
"""
if self.env.attachment_service is not None:
return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
else:
return None
def append_pageserver_param_overrides(
params_to_update: List[str],

View File

@@ -210,16 +210,25 @@ class PageserverHttpClient(requests.Session):
return res_json
def tenant_create(
self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
self,
new_tenant_id: TenantId,
conf: Optional[Dict[str, Any]] = None,
generation: Optional[int] = None,
) -> TenantId:
if conf is not None:
assert "new_tenant_id" not in conf.keys()
body: Dict[str, Any] = {
"new_tenant_id": str(new_tenant_id),
**(conf or {}),
}
if generation is not None:
body.update({"generation": generation})
res = self.post(
f"http://localhost:{self.port}/v1/tenant",
json={
"new_tenant_id": str(new_tenant_id),
**(conf or {}),
},
json=body,
)
self.verbose_error(res)
if res.status_code == 409:
@@ -273,8 +282,11 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return res
def tenant_load(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
def tenant_load(self, tenant_id: TenantId, generation=None):
body = None
if generation is not None:
body = {"generation": generation}
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
self.verbose_error(res)
def tenant_ignore(self, tenant_id: TenantId):

View File

@@ -6,9 +6,8 @@ from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
@dataclass
class IndexLayerMetadata:
@classmethod
def from_json(cls, d: Dict[str, Any]):
return {}
file_size: int
generation: int
@dataclass(frozen=True)
@@ -139,7 +138,7 @@ class IndexPartDump:
def from_json(cls, d: Dict[str, Any]) -> "IndexPartDump":
return IndexPartDump(
layer_metadata={
parse_layer_file_name(n): IndexLayerMetadata.from_json(v)
parse_layer_file_name(n): IndexLayerMetadata(v["file_size"], v["generation"])
for n, v in d["layer_metadata"].items()
},
disk_consistent_lsn=Lsn(d["disk_consistent_lsn"]),
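
A hedged example of the `layer_metadata` entry shape the updated parser expects, assuming `IndexLayerMetadata` lives in `fixtures.pageserver.types` alongside the other helpers (the numbers are invented; only the field names come from the code above):

```python
from fixtures.pageserver.types import IndexLayerMetadata

# One entry of index_part.json's "layer_metadata" map; values are invented.
raw = {"file_size": 23456789, "generation": 2}
meta = IndexLayerMetadata(raw["file_size"], raw["generation"])
assert meta.generation == 2
```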

View File

@@ -12,7 +12,6 @@ import boto3
from mypy_boto3_s3 import S3Client
from fixtures.log_helper import log
from fixtures.pageserver.types import LayerFileName
from fixtures.types import TenantId, TimelineId
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
@@ -88,13 +87,46 @@ class LocalFsStorage:
def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
def layer_path(
self, tenant_id: TenantId, timeline_id: TimelineId, layer_file_name: LayerFileName
):
return self.timeline_path(tenant_id, timeline_id) / layer_file_name.to_str()
def timeline_latest_generation(self, tenant_id, timeline_id):
timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id))
index_parts = [f for f in timeline_files if f.startswith("index_part")]
def parse_gen(filename):
log.info(f"parsing index_part '{filename}'")
parts = filename.split("-")
if len(parts) == 2:
return int(parts[1], 16)
else:
return None
generations = sorted([parse_gen(f) for f in index_parts])
if len(generations) == 0:
raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}")
return generations[-1]
def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME
latest_gen = self.timeline_latest_generation(tenant_id, timeline_id)
if latest_gen is None:
filename = TIMELINE_INDEX_PART_FILE_NAME
else:
filename = f"{TIMELINE_INDEX_PART_FILE_NAME}-{latest_gen:08x}"
return self.timeline_path(tenant_id, timeline_id) / filename
def remote_layer_path(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
local_name: str,
generation: Optional[int] = None,
):
if generation is None:
generation = self.timeline_latest_generation(tenant_id, timeline_id)
assert generation is not None, "Cannot calculate remote layer path without generation"
filename = f"{local_name}-{generation:08x}"
return self.timeline_path(tenant_id, timeline_id) / filename
def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
with self.index_path(tenant_id, timeline_id).open("r") as f:
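
A short sketch of the naming convention these helpers rely on: remote object names carry the generation as an 8-hex-digit suffix (the helper below is hypothetical; the suffix format matches the `f"{...}-{generation:08x}"` calls above):

```python
def remote_name(local_name: str, generation: int) -> str:
    # Same formatting as remote_layer_path() / index_path() above.
    return f"{local_name}-{generation:08x}"


assert remote_name("index_part.json", 3) == "index_part.json-00000003"
```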

View File

@@ -100,7 +100,6 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
env = negative_env.neon_env
tenant_id = negative_env.tenant_id
ps_http = env.pageserver.http_client()
config_with_unknown_keys = {
"compaction_period": "1h",
@@ -108,16 +107,16 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
}
with pytest.raises(PageserverApiException) as e:
ps_http.tenant_attach(tenant_id, config=config_with_unknown_keys)
env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys)
assert e.type == PageserverApiException
assert e.value.status_code == 400
@pytest.mark.parametrize("content_type", [None, "application/json"])
def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
"""
For backwards-compatibility: if we send an empty body,
the request should be accepted and the config should be the default config.
When the 'config' body attribute is omitted, the request should be accepted
and the tenant should use the default configuration
"""
env = positive_env
ps_http = env.pageserver.http_client()
@@ -128,9 +127,14 @@ def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
ps_http.tenant_detach(tenant_id)
assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
body = {}
gen = env.pageserver.maybe_get_generation(tenant_id)
if gen is not None:
body["generation"] = gen
ps_http.post(
f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
data=b"",
json=body,
headers=None if content_type else {"Content-Type": "application/json"},
).raise_for_status()
@@ -191,7 +195,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
}, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
ps_http.tenant_detach(tenant_id)
ps_http.tenant_attach(tenant_id, config=fully_custom_config)
env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)
assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set(

View File

@@ -60,14 +60,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
assert_client_authorized(env, invalid_tenant_http_client)
# create tenant using management token
pageserver_http_client.tenant_create(TenantId.generate())
env.pageserver.tenant_create(TenantId.generate(), auth_token=pageserver_token)
# fail to create tenant using tenant token
with pytest.raises(
PageserverApiException,
match="Forbidden: JWT authentication error",
):
tenant_http_client.tenant_create(TenantId.generate())
env.pageserver.tenant_create(TenantId.generate(), auth_token=tenant_token)
def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):

View File

@@ -158,7 +158,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
ps_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
initial_branch = "initial_branch"
@@ -200,7 +200,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
ps_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
def start_creating_timeline():
with pytest.raises(RequestException):
@@ -257,7 +257,7 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
ps_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
def start_creating_timeline():
ps_http.timeline_create(
@@ -343,8 +343,7 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
)
ps_http = env.pageserver.http_client()
# pause all uploads
ps_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
# Create a timeline whose creation will succeed. The tenant will need at least one
# timeline to be loadable.
@@ -397,7 +396,7 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB
)
ps_http = env.pageserver.http_client()
ps_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
ps_http.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
# pause all uploads

View File

@@ -160,7 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
]
)
pageserver_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)

View File

@@ -14,6 +14,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
)
env = neon_env_builder.init_start()
for pageserver in env.pageservers:
# This test dual-attaches a tenant, one of the pageservers will therefore
# be running with a stale generation.
pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
env.neon_cli.create_branch("test_change_pageserver")
endpoint = env.endpoints.create_start("test_change_pageserver")
@@ -79,6 +84,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
# Try failing back, and this time we will stop the current pageserver before reconfiguring
# the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
# is more like what happens in an unexpected pageserver failure.
#
# Since we're dual-attached, need to tip-off attachment service to treat the one we're
# about to start as the attached pageserver
assert env.attachment_service is not None
env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
env.pageservers[0].start()
env.pageservers[1].stop()
@@ -88,6 +98,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
assert fetchone() == (100000,)
env.pageservers[0].stop()
# Since we're dual-attached, need to tip-off attachment service to treat the one we're
# about to start as the attached pageserver
env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
env.pageservers[1].start()
# Test a (former) bug where a child process spins without updating its connection string

View File

@@ -112,7 +112,9 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
assert l1_found is not None, "failed to find L1 locally"
uploaded = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
uploaded = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, l1_found.name
)
assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
env.pageserver.start()
@@ -139,4 +141,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
uploaded = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, l1_found.name
)
assert uploaded.exists(), "the L1 is uploaded"

View File

@@ -84,8 +84,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
client.tenant_create(tenant)
env.pageserver.tenant_create(tenant)
env.pageserver.allowed_errors.extend(
[
@@ -149,6 +148,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
".*WARN.*ignored .* unexpected bytes after the tar archive.*"
)
client = env.pageserver.http_client()
timeline_delete_wait_completed(client, tenant, timeline)
# Importing correct backup works
@@ -292,7 +292,7 @@ def _import(
# Import to pageserver
endpoint_id = "ep-import_from_pageserver"
client = env.pageserver.http_client()
client.tenant_create(tenant)
env.pageserver.tenant_create(tenant)
env.neon_cli.raw_cli(
[
"timeline",

View File

@@ -149,19 +149,28 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
future_layer_path = env.pageserver_remote_storage.layer_path(
tenant_id, timeline_id, future_layer
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str()
)
log.info(f"future layer path: {future_layer_path}")
pre_stat = future_layer_path.stat()
time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites
def get_generation_number():
assert env.attachment_service is not None
attachment = env.attachment_service.inspect(tenant_id)
assert attachment is not None
return attachment[0]
# force removal of layers from the future
tenant_conf = ps_http.tenant_config(tenant_id)
ps_http.tenant_detach(tenant_id)
generation_before_detach = get_generation_number()
env.pageserver.tenant_detach(tenant_id)
failpoint_name = "before-delete-layer-pausable"
ps_http.configure_failpoints((failpoint_name, "pause"))
ps_http.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
generation_after_reattach = get_generation_number()
wait_until_tenant_active(ps_http, tenant_id)
# Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
@@ -177,6 +186,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
wait_until(10, 0.5, delete_at_pause_point)
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
)
log.info(f"future layer path: {future_layer_path}")
assert future_layer_path.exists()
# wait for re-ingestion of the WAL from safekeepers into the in-memory layer
@@ -215,12 +228,17 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
# Examine the resulting S3 state.
log.info("integrity-check the remote storage")
ip = get_index_part()
for layer_file_name in ip.layer_metadata.keys():
layer_path = env.pageserver_remote_storage.layer_path(
tenant_id, timeline_id, layer_file_name
for layer_file_name, layer_metadata in ip.layer_metadata.items():
log.info(f"Layer metadata {layer_file_name.to_str()}: {layer_metadata}")
layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, layer_file_name.to_str(), layer_metadata.generation
)
assert layer_path.exists(), f"{layer_file_name.to_str()}"
log.info("assert that the overwritten layer won")
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str(), generation=generation_after_reattach
)
final_stat = future_layer_path.stat()
log.info(f"future layer path: {future_layer_path}")
assert final_stat.st_mtime != pre_stat.st_mtime

View File

@@ -133,6 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
# Stop default ps/sk
env.neon_cli.pageserver_stop(env.pageserver.id)
env.neon_cli.safekeeper_stop()
env.neon_cli.attachment_service_stop(False)
# Keep NeonEnv state up to date, it usually owns starting/stopping services
env.pageserver.running = False
@@ -173,6 +174,9 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1)
env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)
# Stop this to get out of the way of the following `start`
env.neon_cli.attachment_service_stop(False)
# Default start
res = env.neon_cli.raw_cli(["start"])
res.check_returncode()

View File

@@ -8,7 +8,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -62,7 +61,10 @@ def test_pageserver_init_node_id(
assert "has node id already, it cannot be overridden" in bad_update.stderr
def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId):
def check_client(env: NeonEnv, client: PageserverHttpClient):
pg_version = env.pg_version
initial_tenant = env.initial_tenant
client.check_status()
# check initial tenant is there
@@ -70,7 +72,7 @@ def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_te
# create new tenant and check it is also there
tenant_id = TenantId.generate()
client.tenant_create(tenant_id)
client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id))
assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}
timelines = client.timeline_list(tenant_id)
@@ -181,7 +183,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
check_client(env.pg_version, client, env.initial_tenant)
check_client(env, client)
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
@@ -191,4 +193,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
pageserver_token = env.auth_keys.generate_pageserver_token()
with env.pageserver.http_client(auth_token=pageserver_token) as client:
check_client(env.pg_version, client, env.initial_tenant)
check_client(env, client)

View File

@@ -23,7 +23,6 @@ from fixtures.pageserver.utils import (
wait_until_tenant_state,
)
from fixtures.remote_storage import (
TIMELINE_INDEX_PART_FILE_NAME,
LocalFsStorage,
RemoteStorageKind,
available_remote_storages,
@@ -350,6 +349,13 @@ def test_remote_storage_upload_queue_retries(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
# We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before
# we later increment when actually attaching it again, leading to skipping a generation and potentially getting
# these warnings if there was a durable but un-executed deletion list at time of restart.
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -648,7 +654,7 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder):
), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}"
client.tenant_detach(env.initial_tenant)
client.tenant_attach(env.initial_tenant)
env.pageserver.tenant_attach(env.initial_tenant)
wait_until_tenant_state(client, env.initial_tenant, "Active", 5)
timelines_after_detach = set(
@@ -758,10 +764,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
# this is because creating a timeline always awaits for the uploads to complete
assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
assert (
new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
assert env.pageserver_remote_storage.index_path(
env.initial_tenant, new_branch_timeline_id
).is_file(), "uploads scheduled during initial load should had been awaited for"
finally:
barrier.abort()
create_thread.join()

View File

@@ -314,7 +314,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
assert not config_path.exists(), "detach did not remove config file"
http_client.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
wait_until(
number_of_iterations=5,
interval=1,

View File

@@ -380,7 +380,7 @@ def test_tenant_delete_is_resumed_on_attach(
env.pageserver.start()
# now we call attach
ps_http.tenant_attach(tenant_id=tenant_id)
env.pageserver.tenant_attach(tenant_id=tenant_id)
# delete should be resumed
wait_tenant_status_404(ps_http, tenant_id, iterations)
@@ -419,7 +419,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
)
pageserver_http.tenant_create(env.initial_tenant)
env.pageserver.tenant_create(env.initial_tenant)
failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
pageserver_http.configure_failpoints((failpoint, "pause"))

View File

@@ -82,6 +82,10 @@ def test_tenant_reattach(
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
@@ -112,8 +116,8 @@ def test_tenant_reattach(
if mode == ReattachMode.REATTACH_EXPLICIT:
# Explicitly detach then attach the tenant as two separate API calls
pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_detach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
# Use the reset API to detach/attach in one shot
pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
@@ -192,6 +196,9 @@ def test_tenant_reattach_while_busy(
updates_finished = 0
updates_to_perform = 0
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
nonlocal updates_started, updates_finished, updates_to_perform
@@ -223,7 +230,7 @@ def test_tenant_reattach_while_busy(
pageserver_http.tenant_detach(tenant_id)
await asyncio.sleep(1)
log.info("Re-attaching tenant")
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
log.info("Re-attach finished")
# Continue with 5000 more updates
@@ -244,9 +251,6 @@ def test_tenant_reattach_while_busy(
assert updates_finished == updates_to_perform
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# create new tenant
@@ -454,6 +458,10 @@ def test_detach_while_attaching(
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
@@ -487,7 +495,7 @@ def test_detach_while_attaching(
# And re-attach
pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
# Before it has chance to finish, detach it again
pageserver_http.tenant_detach(tenant_id)
@@ -497,7 +505,7 @@ def test_detach_while_attaching(
# Attach it again. If the GC and compaction loops from the previous attach/detach
# cycle are still running, things could get really confusing..
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
with endpoint.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM foo")
@@ -556,7 +564,7 @@ def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
), "Ignored tenant should not be reloaded after pageserver restart"
# now, load it from the local files and expect it works
pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
@@ -611,7 +619,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
assert layers_removed, f"Found no layers for tenant {timeline_dir}"
# now, load it from the local files and expect it to work due to remote storage restoration
pageserver_http.tenant_load(tenant_id=tenant_id)
env.pageserver.tenant_load(tenant_id=tenant_id)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
@@ -645,13 +653,13 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
expected_exception=PageserverApiException,
match=f"tenant {tenant_id} already exists, state: Active",
):
pageserver_http.tenant_load(tenant_id)
env.pageserver.tenant_load(tenant_id)
with pytest.raises(
expected_exception=PageserverApiException,
match=f"tenant {tenant_id} already exists, state: Active",
):
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
pageserver_http.tenant_ignore(tenant_id)
@@ -660,7 +668,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
expected_exception=PageserverApiException,
match="tenant directory already exists",
):
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
def test_ignore_while_attaching(
@@ -679,6 +687,10 @@ def test_ignore_while_attaching(
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
data_id = 1
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
@@ -689,7 +701,7 @@ def test_ignore_while_attaching(
pageserver_http.tenant_detach(tenant_id)
# And re-attach, but stop attach task_mgr task from completing
pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
# Run ignore on the task, thereby cancelling the attach.
# XXX This should take priority over attach, i.e., it should cancel the attach task.
# But neither the failpoint, nor the proper remote_timeline_client download functions,
@@ -704,7 +716,7 @@ def test_ignore_while_attaching(
expected_exception=PageserverApiException,
match="tenant directory already exists",
):
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
@@ -714,7 +726,7 @@ def test_ignore_while_attaching(
# Calling load will bring the tenant back online
pageserver_http.configure_failpoints([("attach-before-activate", "off")])
pageserver_http.tenant_load(tenant_id)
env.pageserver.tenant_load(tenant_id)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
@@ -818,7 +830,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
found_broken
), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
client.tenant_load(env.initial_tenant)
env.pageserver.tenant_load(env.initial_tenant)
found_active = False
active, broken_set = ([], [])

View File

@@ -7,13 +7,8 @@ from pathlib import Path
from typing import Any, Dict, Optional, Tuple
import pytest
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
)
from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_state,
@@ -30,7 +25,6 @@ from fixtures.remote_storage import (
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
query_scalar,
start_in_background,
subprocess_capture,
wait_until,
)
@@ -40,58 +34,6 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
assert abs(a - b) / a < margin_ratio, abs(a - b) / a
@contextmanager
def new_pageserver_service(
new_pageserver_dir: Path,
pageserver_bin: Path,
remote_storage_mock_path: Path,
pg_port: int,
http_port: int,
broker: Optional[NeonBroker],
pg_distrib_dir: Path,
):
"""
cannot use NeonPageserver yet because it depends on neon cli
which currently lacks support for multiple pageservers
"""
# actually run new pageserver
cmd = [
str(pageserver_bin),
"--workdir",
str(new_pageserver_dir),
"--update-config",
f"-c listen_pg_addr='localhost:{pg_port}'",
f"-c listen_http_addr='localhost:{http_port}'",
f"-c pg_distrib_dir='{pg_distrib_dir}'",
"-c id=2",
f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}",
]
if broker is not None:
cmd.append(
f"-c broker_endpoint='{broker.client_url()}'",
)
pageserver_client = PageserverHttpClient(
port=http_port,
auth_token=None,
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled
)
try:
pageserver_process = start_in_background(
cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status
)
except Exception as e:
log.error(e)
pageserver_process.kill()
raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") from e
log.info("new pageserver started")
try:
yield pageserver_process
finally:
log.info("stopping new pageserver")
pageserver_process.kill()
@contextmanager
def pg_cur(endpoint):
with closing(endpoint.connect()) as conn:
@@ -201,7 +143,7 @@ def check_timeline_attached(
def switch_pg_to_new_pageserver(
env: NeonEnv,
origin_ps: NeonPageserver,
endpoint: Endpoint,
new_pageserver_port: int,
tenant_id: TenantId,
@@ -216,7 +158,7 @@ def switch_pg_to_new_pageserver(
endpoint.start()
timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
@@ -269,27 +211,32 @@ def test_tenant_relocation(
with_load: str,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start()
tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
# FIXME: Is this expected?
env.pageserver.allowed_errors.append(
env.pageservers[0].allowed_errors.append(
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
)
# Needed for detach polling.
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
# Needed for detach polling on the original pageserver
env.pageservers[0].allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
# We will dual-attach in this test, so stale generations are expected
env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates.*")
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
remote_storage_mock_path = env.pageserver_remote_storage.root
# we use two branches to check that they are both relocated
# first branch is used for load, compute for second one is used to
# check that data is not lost
pageserver_http = env.pageserver.http_client()
origin_ps = env.pageservers[0]
destination_ps = env.pageservers[1]
origin_http = origin_ps.http_client()
destination_http = destination_ps.http_client()
_, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)
@@ -302,7 +249,7 @@ def test_tenant_relocation(
timeline_id_main, current_lsn_main = populate_branch(
ep_main,
tenant_id=tenant_id,
ps_http=pageserver_http,
ps_http=origin_http,
create_table=True,
expected_sum=500500,
)
@@ -320,17 +267,17 @@ def test_tenant_relocation(
timeline_id_second, current_lsn_second = populate_branch(
ep_second,
tenant_id=tenant_id,
ps_http=pageserver_http,
ps_http=origin_http,
create_table=False,
expected_sum=1001000,
)
# wait until pageserver receives that data
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main)
timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main)
wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_main, current_lsn_main)
timeline_detail_main = origin_http.timeline_detail(tenant_id, timeline_id_main)
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second)
timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second)
wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_second, current_lsn_second)
timeline_detail_second = origin_http.timeline_detail(tenant_id, timeline_id_second)
if with_load == "with_load":
# create load table
@@ -350,170 +297,149 @@ def test_tenant_relocation(
# if user creates a branch during migration
# it wont appear on the new pageserver
ensure_checkpoint(
pageserver_http=pageserver_http,
pageserver_http=origin_http,
tenant_id=tenant_id,
timeline_id=timeline_id_main,
current_lsn=current_lsn_main,
)
ensure_checkpoint(
pageserver_http=pageserver_http,
pageserver_http=origin_http,
tenant_id=tenant_id,
timeline_id=timeline_id_second,
current_lsn=current_lsn_second,
)
log.info("inititalizing new pageserver")
# bootstrap second pageserver
new_pageserver_dir = env.repo_dir / "new_pageserver"
new_pageserver_dir.mkdir()
# Migrate either by attaching from s3 or import/export basebackup
if method == "major":
cmd = [
"poetry",
"run",
"python",
str(base_dir / "scripts/export_import_between_pageservers.py"),
"--tenant-id",
str(tenant_id),
"--from-host",
"localhost",
"--from-http-port",
str(origin_http.port),
"--from-pg-port",
str(origin_ps.service_port.pg),
"--to-host",
"localhost",
"--to-http-port",
str(destination_http.port),
"--to-pg-port",
str(destination_ps.service_port.pg),
"--pg-distrib-dir",
str(neon_env_builder.pg_distrib_dir),
"--work-dir",
str(test_output_dir),
"--tmp-pg-port",
str(port_distributor.get_port()),
]
subprocess_capture(test_output_dir, cmd, check=True)
new_pageserver_pg_port = port_distributor.get_port()
new_pageserver_http_port = port_distributor.get_port()
log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
pageserver_bin = neon_binpath / "pageserver"
destination_ps.allowed_errors.append(
".*ignored .* unexpected bytes after the tar archive.*"
)
elif method == "minor":
# call to attach timeline to new pageserver
destination_ps.tenant_attach(tenant_id)
new_pageserver_http = PageserverHttpClient(
port=new_pageserver_http_port,
auth_token=None,
is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
)
# wait for tenant to finish attaching
wait_until(
number_of_iterations=10,
interval=1,
func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"),
)
with new_pageserver_service(
new_pageserver_dir,
pageserver_bin,
remote_storage_mock_path,
new_pageserver_pg_port,
new_pageserver_http_port,
neon_env_builder.broker,
neon_env_builder.pg_distrib_dir,
):
# Migrate either by attaching from s3 or import/export basebackup
if method == "major":
cmd = [
"poetry",
"run",
"python",
str(base_dir / "scripts/export_import_between_pageservers.py"),
"--tenant-id",
str(tenant_id),
"--from-host",
"localhost",
"--from-http-port",
str(pageserver_http.port),
"--from-pg-port",
str(env.pageserver.service_port.pg),
"--to-host",
"localhost",
"--to-http-port",
str(new_pageserver_http_port),
"--to-pg-port",
str(new_pageserver_pg_port),
"--pg-distrib-dir",
str(neon_env_builder.pg_distrib_dir),
"--work-dir",
str(test_output_dir),
"--tmp-pg-port",
str(port_distributor.get_port()),
]
subprocess_capture(test_output_dir, cmd, check=True)
elif method == "minor":
# call to attach timeline to new pageserver
new_pageserver_http.tenant_attach(tenant_id)
# wait for tenant to finish attaching
wait_until(
number_of_iterations=10,
interval=1,
func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"),
)
check_timeline_attached(
new_pageserver_http,
tenant_id,
timeline_id_main,
timeline_detail_main,
current_lsn_main,
)
check_timeline_attached(
new_pageserver_http,
tenant_id,
timeline_id_second,
timeline_detail_second,
current_lsn_second,
)
# rewrite neon cli config to use new pageserver for basebackup to start new compute
lines = (env.repo_dir / "config").read_text().splitlines()
for i, line in enumerate(lines):
if line.startswith("listen_http_addr"):
lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
if line.startswith("listen_pg_addr"):
lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
(env.repo_dir / "config").write_text("\n".join(lines))
old_local_path_main = switch_pg_to_new_pageserver(
env,
ep_main,
new_pageserver_pg_port,
check_timeline_attached(
destination_http,
tenant_id,
timeline_id_main,
timeline_detail_main,
current_lsn_main,
)
old_local_path_second = switch_pg_to_new_pageserver(
env,
ep_second,
new_pageserver_pg_port,
check_timeline_attached(
destination_http,
tenant_id,
timeline_id_second,
timeline_detail_second,
current_lsn_second,
)
# detach tenant from old pageserver before we check
# that all the data is there to be sure that old pageserver
# is no longer involved, and if it is, we will see the error
pageserver_http.tenant_detach(tenant_id)
# rewrite neon cli config to use new pageserver for basebackup to start new compute
lines = (env.repo_dir / "config").read_text().splitlines()
for i, line in enumerate(lines):
if line.startswith("listen_http_addr"):
lines[i] = f"listen_http_addr = 'localhost:{destination_http.port}'"
if line.startswith("listen_pg_addr"):
lines[i] = f"listen_pg_addr = 'localhost:{destination_ps.service_port.pg}'"
(env.repo_dir / "config").write_text("\n".join(lines))
# Wait a little, so that the detach operation has time to finish.
wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
old_local_path_main = switch_pg_to_new_pageserver(
origin_ps,
ep_main,
destination_ps.service_port.pg,
tenant_id,
timeline_id_main,
)
post_migration_check(ep_main, 500500, old_local_path_main)
post_migration_check(ep_second, 1001000, old_local_path_second)
old_local_path_second = switch_pg_to_new_pageserver(
origin_ps,
ep_second,
destination_ps.service_port.pg,
tenant_id,
timeline_id_second,
)
# ensure that we can successfully read all relations on the new pageserver
with pg_cur(ep_second) as cur:
cur.execute(
"""
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN
SELECT relname FROM pg_class WHERE relkind='r'
LOOP
RAISE NOTICE '%', r.relname;
EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
END LOOP;
END$$;
"""
)
# detach tenant from old pageserver before we check
# that all the data is there to be sure that old pageserver
# is no longer involved, and if it is, we will see the error
origin_http.tenant_detach(tenant_id)
if with_load == "with_load":
assert load_ok_event.wait(3)
log.info("stopping load thread")
load_stop_event.set()
load_thread.join(timeout=10)
log.info("load thread stopped")
# Wait a little, so that the detach operation has time to finish.
wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
# bring old pageserver back for clean shutdown via neon cli
# new pageserver will be shut down by the context manager
lines = (env.repo_dir / "config").read_text().splitlines()
for i, line in enumerate(lines):
if line.startswith("listen_http_addr"):
lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
if line.startswith("listen_pg_addr"):
lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
(env.repo_dir / "config").write_text("\n".join(lines))
post_migration_check(ep_main, 500500, old_local_path_main)
post_migration_check(ep_second, 1001000, old_local_path_second)
# ensure that we can successfully read all relations on the new pageserver
with pg_cur(ep_second) as cur:
cur.execute(
"""
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN
SELECT relname FROM pg_class WHERE relkind='r'
LOOP
RAISE NOTICE '%', r.relname;
EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
END LOOP;
END$$;
"""
)
if with_load == "with_load":
assert load_ok_event.wait(3)
log.info("stopping load thread")
load_stop_event.set()
load_thread.join(timeout=10)
log.info("load thread stopped")
# bring old pageserver back for clean shutdown via neon cli
# new pageserver will be shut down by the context manager
lines = (env.repo_dir / "config").read_text().splitlines()
for i, line in enumerate(lines):
if line.startswith("listen_http_addr"):
lines[i] = f"listen_http_addr = 'localhost:{origin_ps.service_port.http}'"
if line.startswith("listen_pg_addr"):
lines[i] = f"listen_pg_addr = 'localhost:{origin_ps.service_port.pg}'"
(env.repo_dir / "config").write_text("\n".join(lines))
# Simulate hard crash of pageserver and re-attach a tenant with a branch
@@ -571,7 +497,7 @@ def test_emergency_relocate_with_branches_slow_replay(
# Attach and wait a few seconds to give it time to load the tenants, attach to the
# safekeepers, and to stream and ingest the WAL up to the pause-point.
before_attach_time = time.time()
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
time.sleep(3)
# The wal ingestion on the main timeline should now be paused at the fail point.
@@ -718,7 +644,7 @@ def test_emergency_relocate_with_branches_createdb(
# ingest the WAL, but let's make this less dependent on accidental timing.
pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
before_attach_time = time.time()
pageserver_http.tenant_attach(tenant_id)
env.pageserver.tenant_attach(tenant_id)
child_endpoint.start()
with child_endpoint.cursor(dbname="neondb") as cur:

View File

@@ -297,8 +297,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"
# the remote side of local_layer_truncated
remote_layer_path = (
env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / path.name
remote_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, path.name
)
# if the upload ever was ongoing, this check would be racy, but at least one

View File

@@ -396,7 +396,7 @@ def test_timeline_resurrection_on_attach(
##### Second start, restore the data and ensure that we see only timeline that wasnt deleted
env.pageserver.start()
ps_http.tenant_attach(tenant_id=tenant_id)
env.pageserver.tenant_attach(tenant_id=tenant_id)
wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
@@ -897,7 +897,7 @@ def test_timeline_delete_resumed_on_attach(
env.pageserver.start()
# now we call attach
ps_http.tenant_attach(tenant_id=tenant_id)
env.pageserver.tenant_attach(tenant_id=tenant_id)
# delete should be resumed
wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)

View File

@@ -298,17 +298,21 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
# and wait till remote_consistent_lsn propagates to all safekeepers
#
# TODO: this executes long as timeline on safekeeper is immediately
# deactivated once rcl reaches pageserver one, and thus we generally wait
# till pageserver reconnects to all safekeepers one by one here. Timeline
# status on safekeeper should take into account peers state as well.
# This timeout is long: safekeepers learn about remote_consistent_lsn updates when a pageserver
# connects, receives a PrimaryKeepAlive, and sends a PageserverFeedback. So the timeout has to encompass:
# - pageserver deletion_queue to validate + publish the remote_consistent_lsn
# - pageserver to reconnect to all safekeepers one by one, with multi-second delays between
#
# TODO: timeline status on safekeeper should take into account peers state as well.
rcl_propagate_secs = 60
started_at = time.time()
while True:
stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
break
elapsed = time.time() - started_at
if elapsed > 30:
if elapsed > rcl_propagate_secs:
raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
)