mirror of https://github.com/neondatabase/neon.git (synced 2025-12-22 21:59:59 +00:00)
tests: update for tenant generations (#5449)
## Problem

Some existing tests are written in a way that's incompatible with tenant generations.

## Summary of changes

Update all the tests that need updating: mostly, calling through the `NeonPageserver.tenant_attach` helper to acquire a generation number instead of calling directly into the pageserver API, plus various more subtle cases.
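To make the pattern concrete before the diff itself: below is a minimal, self-contained Python sketch of the flow the tests move to. This is illustrative only — `FakeAttachmentService` and the standalone `maybe_get_generation` are stand-ins, not the real fixtures (the real helper lives on `NeonPageserver`, shown later in this diff):

```python
from typing import Dict, Optional


class FakeAttachmentService:
    """Illustrative stand-in for env.attachment_service: issues increasing generations."""

    def __init__(self) -> None:
        self._generations: Dict[str, int] = {}

    def attach_hook_issue(self, tenant_id: str, pageserver_id: int) -> int:
        # Each (re-)attachment bumps the tenant's generation number.
        gen = self._generations.get(tenant_id, 0) + 1
        self._generations[tenant_id] = gen
        return gen


def maybe_get_generation(
    service: Optional[FakeAttachmentService], tenant_id: str, ps_id: int
) -> Optional[int]:
    # Legacy mode: no attachment service configured, so no generation.
    if service is None:
        return None
    return service.attach_hook_issue(tenant_id, ps_id)


svc = FakeAttachmentService()
assert maybe_get_generation(None, "t1", 1) is None  # legacy mode
assert maybe_get_generation(svc, "t1", 1) == 1      # first attachment
assert maybe_get_generation(svc, "t1", 2) == 2      # re-attach increments
```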
@@ -201,6 +201,12 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
     // TODO(sharding): make this shard-aware
     if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
         let valid = tenant_state.generation == req_tenant.gen;
+        tracing::info!(
+            "handle_validate: {}(gen {}): valid={valid} (latest {})",
+            req_tenant.id,
+            req_tenant.gen,
+            tenant_state.generation
+        );
         response.tenants.push(ValidateResponseTenant {
             id: req_tenant.id,
             valid,
@@ -250,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
         tenant_state.pageserver = attach_req.node_id;
         let generation = tenant_state.generation;

+        tracing::info!(
+            "handle_attach_hook: tenant {} set generation {}, pageserver {}",
+            attach_req.tenant_id,
+            tenant_state.generation,
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+        );

     locked.save().await.map_err(ApiError::InternalServerError)?;

     json_response(
@@ -323,6 +323,7 @@ impl TenantConfigRequest {

 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
     #[serde(default)]
     pub config: TenantAttachConfig,
+    #[serde(default)]
+    pub generation: Option<u32>,
@@ -330,7 +331,7 @@ pub struct TenantAttachRequest {

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
     #[serde(flatten)]
@@ -99,27 +99,35 @@ impl LocalFs {
         };

         // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the parent directory. This is because
+        // Otherwise, we must go up to the first ancestor dir that exists. This is because
         // S3 object list prefixes can be arbitrary strings, but when reading
         // the local filesystem we need a directory to start calling read_dir on.
         let mut initial_dir = full_path.clone();
-        match fs::metadata(full_path.clone()).await {
-            Ok(meta) => {
-                if !meta.is_dir() {
+        loop {
+            // Did we make it to the root?
+            if initial_dir.parent().is_none() {
+                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+            }
+
+            match fs::metadata(initial_dir.clone()).await {
+                Ok(meta) if meta.is_dir() => {
+                    // We found a directory, break
+                    break;
+                }
+                Ok(_meta) => {
+                    // It's not a directory: strip back to the parent
                     initial_dir.pop();
                 }
-            }
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                // It's not a file that exists: strip the prefix back to the parent directory
-                initial_dir.pop();
-            }
-            Err(e) => {
-                // Unexpected I/O error
-                anyhow::bail!(e)
+                Err(e) if e.kind() == ErrorKind::NotFound => {
+                    // It's not a file that exists: strip the prefix back to the parent directory
+                    initial_dir.pop();
+                }
+                Err(e) => {
+                    // Unexpected I/O error
+                    anyhow::bail!(e)
+                }
             }
         }

         // Note that Utf8PathBuf starts_with only considers full path segments, but
         // object prefixes are arbitrary strings, so we need the strings for doing
         // starts_with later.
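The same ancestor-search logic, sketched in Python purely for illustration (the real implementation above is Rust in `LocalFs::list_files`): an S3-style prefix is an arbitrary string, so to emulate listing against the local filesystem we first walk up to the nearest existing directory.

```python
from pathlib import Path


def first_existing_ancestor_dir(prefix: Path) -> Path:
    # Walk up from an arbitrary prefix string until we hit an existing
    # directory we can read_dir; bail out if we reach the filesystem root.
    current = prefix
    while True:
        if current.is_dir():
            return current
        if current.parent == current:  # reached the root
            raise RuntimeError(f"failed to find valid ancestor dir for {prefix}")
        current = current.parent
```

The caller then filters the listed entries by string prefix, since path-level `starts_with` only matches whole path segments.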
@@ -312,7 +312,18 @@ impl ListWriter {
             for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
                 if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
+                    if attached_gen.previous() == tenant_list.generation {
+                        info!(
+                            seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                            shard_id=%tenant_shard_id.shard_slug(),
+                            old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                            "Updating gen on recovered list");
                         tenant_list.generation = *attached_gen;
+                    } else {
+                        info!(
+                            seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                            shard_id=%tenant_shard_id.shard_slug(),
+                            old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                            "Encountered stale generation on recovered list");
+                    }
                 }
             }
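A sketch of the recovery rule above, in Python for illustration only (the real code is the Rust `ListWriter`; the `Generation::previous()` semantics are assumed here to be `generation - 1`): a deletion list persisted under generation N is only re-adopted after restart if the current attached generation is its direct successor.

```python
from typing import Optional


def recovered_list_generation(list_gen: int, attached_gen: Optional[int]) -> Optional[int]:
    # Assumption: attached_gen.previous() == list_gen means exactly one
    # re-attach happened between writing the list and recovering it.
    if attached_gen is not None and attached_gen - 1 == list_gen:
        return attached_gen  # adopt the new generation for the recovered list
    return None  # stale: the list must not be validated under attached_gen


assert recovered_list_generation(7, 8) == 8     # direct successor: update
assert recovered_list_generation(7, 9) is None  # skipped a generation: stale
```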
@@ -266,9 +266,7 @@ class NeonPageserverHttpClient(requests.Session):
     def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
         res = self.post(
             f"http://{self.host}:{self.port}/v1/tenant",
-            json={
-                "new_tenant_id": new_tenant_id.hex,
-            },
+            json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
         )

         if res.status_code == 409:
@@ -455,7 +455,7 @@ class NeonEnvBuilder:
         self.preserve_database_files = preserve_database_files
         self.initial_tenant = initial_tenant or TenantId.generate()
         self.initial_timeline = initial_timeline or TimelineId.generate()
-        self.enable_generations = False
+        self.enable_generations = True
         self.scrub_on_exit = False
         self.test_output_dir = test_output_dir
@@ -1571,6 +1571,20 @@ class NeonAttachmentService:
         )
         response.raise_for_status()

+    def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]:
+        response = requests.post(
+            f"{self.env.control_plane_api}/inspect",
+            json={"tenant_id": str(tenant_id)},
+        )
+        response.raise_for_status()
+        json = response.json()
+        log.info(f"Response: {json}")
+        if json["attachment"]:
+            # Explicit int() to make python type linter happy
+            return (int(json["attachment"][0]), int(json["attachment"][1]))
+        else:
+            return None
+
     def __enter__(self) -> "NeonAttachmentService":
         return self
@@ -1769,13 +1783,10 @@ class NeonPageserver(PgProtocol):
         Tenant attachment passes through here to acquire a generation number before proceeding
         to call into the pageserver HTTP client.
         """
-        if self.env.attachment_service is not None:
-            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
-        else:
-            generation = None
-
         client = self.http_client()
-        return client.tenant_attach(tenant_id, config, config_null, generation=generation)
+        return client.tenant_attach(
+            tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
+        )

     def tenant_detach(self, tenant_id: TenantId):
         if self.env.attachment_service is not None:
@@ -1784,6 +1795,34 @@ class NeonPageserver(PgProtocol):
         client = self.http_client()
         return client.tenant_detach(tenant_id)

+    def tenant_create(
+        self,
+        tenant_id: TenantId,
+        conf: Optional[Dict[str, Any]] = None,
+        auth_token: Optional[str] = None,
+    ) -> TenantId:
+        client = self.http_client(auth_token=auth_token)
+        return client.tenant_create(
+            tenant_id, conf, generation=self.maybe_get_generation(tenant_id)
+        )
+
+    def tenant_load(self, tenant_id: TenantId):
+        client = self.http_client()
+        return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
+
+    def maybe_get_generation(self, tenant_id: TenantId):
+        """
+        For tests that would like to use an HTTP client directly instead of using
+        the `tenant_attach` and `tenant_create` helpers here: issue a generation
+        number for a tenant.
+
+        Returns None if the attachment service is not enabled (legacy mode)
+        """
+        if self.env.attachment_service is not None:
+            return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+        else:
+            return None
+

 def append_pageserver_param_overrides(
     params_to_update: List[str],
@@ -210,16 +210,25 @@ class PageserverHttpClient(requests.Session):
         return res_json

     def tenant_create(
-        self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
+        self,
+        new_tenant_id: TenantId,
+        conf: Optional[Dict[str, Any]] = None,
+        generation: Optional[int] = None,
     ) -> TenantId:
         if conf is not None:
             assert "new_tenant_id" not in conf.keys()

+        body: Dict[str, Any] = {
+            "new_tenant_id": str(new_tenant_id),
+            **(conf or {}),
+        }
+
+        if generation is not None:
+            body.update({"generation": generation})
+
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant",
-            json={
-                "new_tenant_id": str(new_tenant_id),
-                **(conf or {}),
-            },
+            json=body,
         )
         self.verbose_error(res)
         if res.status_code == 409:
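For illustration only, the request body the updated `tenant_create` ends up posting when the attachment service has issued a generation (the tenant id is borrowed from elsewhere in this diff; the config value is made up):

```python
import json

conf = {"compaction_period": "1h"}
generation = 1  # as returned by maybe_get_generation()

body = {"new_tenant_id": "74ee8b079a0e437eb0afea7d26a07209", **conf}
if generation is not None:
    body.update({"generation": generation})

print(json.dumps(body))
# {"new_tenant_id": "74ee8b079a0e437eb0afea7d26a07209", "compaction_period": "1h", "generation": 1}
```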
@@ -273,8 +282,11 @@ class PageserverHttpClient(requests.Session):
         self.verbose_error(res)
         return res

-    def tenant_load(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
+    def tenant_load(self, tenant_id: TenantId, generation=None):
+        body = None
+        if generation is not None:
+            body = {"generation": generation}
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
         self.verbose_error(res)

     def tenant_ignore(self, tenant_id: TenantId):
@@ -6,9 +6,8 @@ from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn

 @dataclass
 class IndexLayerMetadata:
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]):
-        return {}
+    file_size: int
+    generation: int


 @dataclass(frozen=True)
@@ -139,7 +138,7 @@ class IndexPartDump:
     def from_json(cls, d: Dict[str, Any]) -> "IndexPartDump":
         return IndexPartDump(
             layer_metadata={
-                parse_layer_file_name(n): IndexLayerMetadata.from_json(v)
+                parse_layer_file_name(n): IndexLayerMetadata(v["file_size"], v["generation"])
                 for n, v in d["layer_metadata"].items()
             },
             disk_consistent_lsn=Lsn(d["disk_consistent_lsn"]),
@@ -12,7 +12,6 @@ import boto3
 from mypy_boto3_s3 import S3Client

 from fixtures.log_helper import log
-from fixtures.pageserver.types import LayerFileName
 from fixtures.types import TenantId, TimelineId

 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
@@ -88,13 +87,46 @@ class LocalFsStorage:
     def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
         return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)

-    def layer_path(
-        self, tenant_id: TenantId, timeline_id: TimelineId, layer_file_name: LayerFileName
-    ):
-        return self.timeline_path(tenant_id, timeline_id) / layer_file_name.to_str()
+    def timeline_latest_generation(self, tenant_id, timeline_id):
+        timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id))
+        index_parts = [f for f in timeline_files if f.startswith("index_part")]
+
+        def parse_gen(filename):
+            log.info(f"parsing index_part '{filename}'")
+            parts = filename.split("-")
+            if len(parts) == 2:
+                return int(parts[1], 16)
+            else:
+                return None
+
+        generations = sorted([parse_gen(f) for f in index_parts])
+        if len(generations) == 0:
+            raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}")
+        return generations[-1]

     def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
-        return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME
+        latest_gen = self.timeline_latest_generation(tenant_id, timeline_id)
+        if latest_gen is None:
+            filename = TIMELINE_INDEX_PART_FILE_NAME
+        else:
+            filename = f"{TIMELINE_INDEX_PART_FILE_NAME}-{latest_gen:08x}"
+
+        return self.timeline_path(tenant_id, timeline_id) / filename
+
+    def remote_layer_path(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        local_name: str,
+        generation: Optional[int] = None,
+    ):
+        if generation is None:
+            generation = self.timeline_latest_generation(tenant_id, timeline_id)
+
+        assert generation is not None, "Cannot calculate remote layer path without generation"
+
+        filename = f"{local_name}-{generation:08x}"
+        return self.timeline_path(tenant_id, timeline_id) / filename

     def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
         with self.index_path(tenant_id, timeline_id).open("r") as f:
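A runnable illustration of the filename convention this fixture relies on: with generations enabled, remote `index_part.json` (and layer files) carry a `-<generation>` suffix in eight hex digits, while a bare name is the legacy, pre-generation format.

```python
def parse_gen(filename: str):
    parts = filename.split("-")
    if len(parts) == 2:
        return int(parts[1], 16)
    return None  # legacy file without a generation suffix


names = ["index_part.json", "index_part.json-00000002", "index_part.json-0000000a"]
assert parse_gen("index_part.json") is None
assert max(g for g in map(parse_gen, names) if g is not None) == 10  # 0x0a is latest
```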
@@ -100,7 +100,6 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):

     env = negative_env.neon_env
     tenant_id = negative_env.tenant_id
-    ps_http = env.pageserver.http_client()

     config_with_unknown_keys = {
         "compaction_period": "1h",
@@ -108,16 +107,16 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
     }

     with pytest.raises(PageserverApiException) as e:
-        ps_http.tenant_attach(tenant_id, config=config_with_unknown_keys)
+        env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys)
     assert e.type == PageserverApiException
     assert e.value.status_code == 400


 @pytest.mark.parametrize("content_type", [None, "application/json"])
-def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
+def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
     """
-    For backwards-compatibility: if we send an empty body,
-    the request should be accepted and the config should be the default config.
+    When the 'config' body attribute is omitted, the request should be accepted
+    and the tenant should use the default configuration
     """
     env = positive_env
     ps_http = env.pageserver.http_client()
@@ -128,9 +127,14 @@ def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
     ps_http.tenant_detach(tenant_id)
     assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]

+    body = {}
+    gen = env.pageserver.maybe_get_generation(tenant_id)
+    if gen is not None:
+        body["generation"] = gen
+
     ps_http.post(
         f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
-        data=b"",
+        json=body,
         headers=None if content_type else {"Content-Type": "application/json"},
     ).raise_for_status()
@@ -191,7 +195,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
     }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"

     ps_http.tenant_detach(tenant_id)
-    ps_http.tenant_attach(tenant_id, config=fully_custom_config)
+    env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)

     assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
     assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set(
@@ -60,14 +60,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
     assert_client_authorized(env, invalid_tenant_http_client)

     # create tenant using management token
-    pageserver_http_client.tenant_create(TenantId.generate())
+    env.pageserver.tenant_create(TenantId.generate(), auth_token=pageserver_token)

     # fail to create tenant using tenant token
     with pytest.raises(
         PageserverApiException,
         match="Forbidden: JWT authentication error",
     ):
-        tenant_http_client.tenant_create(TenantId.generate())
+        env.pageserver.tenant_create(TenantId.generate(), auth_token=tenant_token)


 def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
@@ -158,7 +158,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE

     # pause all uploads
     ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)

     initial_branch = "initial_branch"
@@ -200,7 +200,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder

     # pause all uploads
     ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)

     def start_creating_timeline():
         with pytest.raises(RequestException):
@@ -257,7 +257,7 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N

     # pause all uploads
     ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)

     def start_creating_timeline():
         ps_http.timeline_create(
@@ -343,8 +343,7 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
     )
     ps_http = env.pageserver.http_client()

-    # pause all uploads
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)

     # Create a timeline whose creation will succeed. The tenant will need at least one
     # timeline to be loadable.
@@ -397,7 +396,7 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB
     )
     ps_http = env.pageserver.http_client()

-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
     ps_http.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)

     # pause all uploads
@@ -160,7 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
         ]
     )

-    pageserver_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
     tenant_id = env.initial_tenant

     timelines_dir = env.pageserver.timeline_dir(tenant_id)
@@ -14,6 +14,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     )
     env = neon_env_builder.init_start()

+    for pageserver in env.pageservers:
+        # This test dual-attaches a tenant, one of the pageservers will therefore
+        # be running with a stale generation.
+        pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     env.neon_cli.create_branch("test_change_pageserver")
     endpoint = env.endpoints.create_start("test_change_pageserver")
@@ -79,6 +84,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     # Try failing back, and this time we will stop the current pageserver before reconfiguring
     # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
     # is more like what happens in an unexpected pageserver failure.
+    #
+    # Since we're dual-attached, need to tip-off attachment service to treat the one we're
+    # about to start as the attached pageserver
+    assert env.attachment_service is not None
+    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
     env.pageservers[0].start()
     env.pageservers[1].stop()
@@ -88,6 +98,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     assert fetchone() == (100000,)

     env.pageservers[0].stop()
+    # Since we're dual-attached, need to tip-off attachment service to treat the one we're
+    # about to start as the attached pageserver
+    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
     env.pageservers[1].start()

     # Test a (former) bug where a child process spins without updating its connection string
@@ -112,7 +112,9 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)

     assert l1_found is not None, "failed to find L1 locally"

-    uploaded = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
+    uploaded = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, l1_found.name
+    )
     assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"

     env.pageserver.start()
@@ -139,4 +141,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)

     wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)

+    uploaded = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, l1_found.name
+    )
     assert uploaded.exists(), "the L1 is uploaded"
@@ -84,8 +84,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()

-    client = env.pageserver.http_client()
-    client.tenant_create(tenant)
+    env.pageserver.tenant_create(tenant)

     env.pageserver.allowed_errors.extend(
         [
@@ -149,6 +148,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
         ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
     )

+    client = env.pageserver.http_client()
     timeline_delete_wait_completed(client, tenant, timeline)

     # Importing correct backup works
@@ -292,7 +292,7 @@ def _import(
     # Import to pageserver
     endpoint_id = "ep-import_from_pageserver"
     client = env.pageserver.http_client()
-    client.tenant_create(tenant)
+    env.pageserver.tenant_create(tenant)
     env.neon_cli.raw_cli(
         [
             "timeline",
@@ -149,19 +149,28 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
         f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
     )
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
-    future_layer_path = env.pageserver_remote_storage.layer_path(
-        tenant_id, timeline_id, future_layer
+    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, future_layer.to_str()
     )
     log.info(f"future layer path: {future_layer_path}")
     pre_stat = future_layer_path.stat()
     time.sleep(1.1)  # so that we can use change in pre_stat.st_mtime to detect overwrites

+    def get_generation_number():
+        assert env.attachment_service is not None
+        attachment = env.attachment_service.inspect(tenant_id)
+        assert attachment is not None
+        return attachment[0]
+
     # force removal of layers from the future
     tenant_conf = ps_http.tenant_config(tenant_id)
-    ps_http.tenant_detach(tenant_id)
+    generation_before_detach = get_generation_number()
+    env.pageserver.tenant_detach(tenant_id)
     failpoint_name = "before-delete-layer-pausable"

     ps_http.configure_failpoints((failpoint_name, "pause"))
-    ps_http.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    generation_after_reattach = get_generation_number()
     wait_until_tenant_active(ps_http, tenant_id)

     # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
@@ -177,6 +186,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
         assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")

     wait_until(10, 0.5, delete_at_pause_point)
+    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
+    )
+    log.info(f"future layer path: {future_layer_path}")
     assert future_layer_path.exists()

     # wait for re-ingestion of the WAL from safekeepers into the in-memory layer
@@ -215,12 +228,17 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     # Examine the resulting S3 state.
     log.info("integrity-check the remote storage")
     ip = get_index_part()
-    for layer_file_name in ip.layer_metadata.keys():
-        layer_path = env.pageserver_remote_storage.layer_path(
-            tenant_id, timeline_id, layer_file_name
+    for layer_file_name, layer_metadata in ip.layer_metadata.items():
+        log.info(f"Layer metadata {layer_file_name.to_str()}: {layer_metadata}")
+        layer_path = env.pageserver_remote_storage.remote_layer_path(
+            tenant_id, timeline_id, layer_file_name.to_str(), layer_metadata.generation
         )
         assert layer_path.exists(), f"{layer_file_name.to_str()}"

     log.info("assert that the overwritten layer won")
+    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, future_layer.to_str(), generation=generation_after_reattach
+    )
     final_stat = future_layer_path.stat()
     log.info(f"future layer path: {future_layer_path}")
     assert final_stat.st_mtime != pre_stat.st_mtime
@@ -133,6 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     # Stop default ps/sk
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
+    env.neon_cli.attachment_service_stop(False)

     # Keep NeonEnv state up to date, it usually owns starting/stopping services
     env.pageserver.running = False
@@ -173,6 +174,9 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1)
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)

+    # Stop this to get out of the way of the following `start`
+    env.neon_cli.attachment_service_stop(False)
+
     # Default start
     res = env.neon_cli.raw_cli(["start"])
     res.check_returncode()
@@ -8,7 +8,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
@@ -62,7 +61,10 @@ def test_pageserver_init_node_id(
     assert "has node id already, it cannot be overridden" in bad_update.stderr


-def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId):
+def check_client(env: NeonEnv, client: PageserverHttpClient):
+    pg_version = env.pg_version
+    initial_tenant = env.initial_tenant
+
     client.check_status()

     # check initial tenant is there
@@ -70,7 +72,7 @@ def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_te

     # create new tenant and check it is also there
     tenant_id = TenantId.generate()
-    client.tenant_create(tenant_id)
+    client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id))
     assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}

     timelines = client.timeline_list(tenant_id)
@@ -181,7 +183,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
 def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
     env = neon_simple_env
     with env.pageserver.http_client() as client:
-        check_client(env.pg_version, client, env.initial_tenant)
+        check_client(env, client)


 def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
@@ -191,4 +193,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
     pageserver_token = env.auth_keys.generate_pageserver_token()

     with env.pageserver.http_client(auth_token=pageserver_token) as client:
-        check_client(env.pg_version, client, env.initial_tenant)
+        check_client(env, client)
@@ -23,7 +23,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import (
-    TIMELINE_INDEX_PART_FILE_NAME,
     LocalFsStorage,
     RemoteStorageKind,
     available_remote_storages,
@@ -350,6 +349,13 @@ def test_remote_storage_upload_queue_retries(
     env.pageserver.stop(immediate=True)
     env.endpoints.stop_all()

+    # We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before
+    # we later increment when actually attaching it again, leading to skipping a generation and potentially getting
+    # these warnings if there was a durable but un-executed deletion list at time of restart.
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
+
     dir_to_clear = env.pageserver.tenant_dir()
     shutil.rmtree(dir_to_clear)
     os.mkdir(dir_to_clear)
@@ -648,7 +654,7 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder):
     ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}"

     client.tenant_detach(env.initial_tenant)
-    client.tenant_attach(env.initial_tenant)
+    env.pageserver.tenant_attach(env.initial_tenant)
     wait_until_tenant_state(client, env.initial_tenant, "Active", 5)

     timelines_after_detach = set(
@@ -758,10 +764,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
             # this is because creating a timeline always awaits for the uploads to complete
             assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

-            assert (
-                new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
+            assert env.pageserver_remote_storage.index_path(
+                env.initial_tenant, new_branch_timeline_id
             ).is_file(), "uploads scheduled during initial load should had been awaited for"
         finally:
             barrier.abort()
             create_thread.join()
@@ -314,7 +314,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):

     assert not config_path.exists(), "detach did not remove config file"

-    http_client.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
     wait_until(
         number_of_iterations=5,
         interval=1,
@@ -380,7 +380,7 @@ def test_tenant_delete_is_resumed_on_attach(
     env.pageserver.start()

     # now we call attach
-    ps_http.tenant_attach(tenant_id=tenant_id)
+    env.pageserver.tenant_attach(tenant_id=tenant_id)

     # delete should be resumed
     wait_tenant_status_404(ps_http, tenant_id, iterations)
@@ -419,7 +419,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
         f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
     )

-    pageserver_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)

     failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
     pageserver_http.configure_failpoints((failpoint, "pause"))
@@ -82,6 +82,10 @@ def test_tenant_reattach(

     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

+    # Our re-attach may race with the deletion queue processing LSN updates
+    # from the original attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
         with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
@@ -112,8 +116,8 @@ def test_tenant_reattach(

     if mode == ReattachMode.REATTACH_EXPLICIT:
         # Explicitly detach then attach the tenant as two separate API calls
-        pageserver_http.tenant_detach(tenant_id)
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_detach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
     elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
         # Use the reset API to detach/attach in one shot
         pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
@@ -192,6 +196,9 @@ def test_tenant_reattach_while_busy(
     updates_finished = 0
     updates_to_perform = 0

+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    env = neon_env_builder.init_start()
+
     # Run random UPDATEs on test table. On failure, try again.
     async def update_table(pg_conn: asyncpg.Connection):
         nonlocal updates_started, updates_finished, updates_to_perform
@@ -223,7 +230,7 @@ def test_tenant_reattach_while_busy(
         pageserver_http.tenant_detach(tenant_id)
         await asyncio.sleep(1)
         log.info("Re-attaching tenant")
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
        log.info("Re-attach finished")

         # Continue with 5000 more updates
@@ -244,9 +251,6 @@ def test_tenant_reattach_while_busy(

     assert updates_finished == updates_to_perform

-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    env = neon_env_builder.init_start()
-
     pageserver_http = env.pageserver.http_client()

     # create new nenant
@@ -454,6 +458,10 @@ def test_detach_while_attaching(

     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

+    # Our re-attach may race with the deletion queue processing LSN updates
+    # from the original attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers, otherwise the SELECT after restart will just return answer
     # from shared_buffers without hitting the page server, which defeats the point
@@ -487,7 +495,7 @@ def test_detach_while_attaching(
     # And re-attach
     pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])

-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)

     # Before it has chance to finish, detach it again
     pageserver_http.tenant_detach(tenant_id)
@@ -497,7 +505,7 @@ def test_detach_while_attaching(

     # Attach it again. If the GC and compaction loops from the previous attach/detach
     # cycle are still running, things could get really confusing..
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)

     with endpoint.cursor() as cur:
         cur.execute("SELECT COUNT(*) FROM foo")
@@ -556,7 +564,7 @@ def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
     ), "Ignored tenant should not be reloaded after pageserver restart"

     # now, load it from the local files and expect it works
-    pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
+    env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
     wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)

     tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
@@ -611,7 +619,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
     assert layers_removed, f"Found no layers for tenant {timeline_dir}"

     # now, load it from the local files and expect it to work due to remote storage restoration
-    pageserver_http.tenant_load(tenant_id=tenant_id)
+    env.pageserver.tenant_load(tenant_id=tenant_id)
     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)

     tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
@@ -645,13 +653,13 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
         expected_exception=PageserverApiException,
         match=f"tenant {tenant_id} already exists, state: Active",
     ):
-        pageserver_http.tenant_load(tenant_id)
+        env.pageserver.tenant_load(tenant_id)

     with pytest.raises(
         expected_exception=PageserverApiException,
         match=f"tenant {tenant_id} already exists, state: Active",
     ):
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)

     pageserver_http.tenant_ignore(tenant_id)
@@ -660,7 +668,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
         expected_exception=PageserverApiException,
         match="tenant directory already exists",
     ):
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)


 def test_ignore_while_attaching(
@@ -679,6 +687,10 @@ def test_ignore_while_attaching(

     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

+    # Our re-attach may race with the deletion queue processing LSN updates
+    # from the original attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     data_id = 1
     data_secret = "very secret secret"
     insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
@@ -689,7 +701,7 @@ def test_ignore_while_attaching(
     pageserver_http.tenant_detach(tenant_id)
     # And re-attach, but stop attach task_mgr task from completing
     pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
     # Run ignore on the task, thereby cancelling the attach.
     # XXX This should take priority over attach, i.e., it should cancel the attach task.
     # But neither the failpoint, nor the proper remote_timeline_client download functions,
@@ -704,7 +716,7 @@ def test_ignore_while_attaching(
         expected_exception=PageserverApiException,
         match="tenant directory already exists",
     ):
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)

     tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
     assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
@@ -714,7 +726,7 @@ def test_ignore_while_attaching(

     # Calling load will bring the tenant back online
     pageserver_http.configure_failpoints([("attach-before-activate", "off")])
-    pageserver_http.tenant_load(tenant_id)
+    env.pageserver.tenant_load(tenant_id)

     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
@@ -818,7 +830,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
             found_broken
         ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"

-    client.tenant_load(env.initial_tenant)
+    env.pageserver.tenant_load(env.initial_tenant)

     found_active = False
     active, broken_set = ([], [])
@@ -7,13 +7,8 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Tuple

 import pytest
-from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    Endpoint,
-    NeonEnv,
-    NeonEnvBuilder,
-)
+from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
@@ -30,7 +25,6 @@ from fixtures.remote_storage import (
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     query_scalar,
-    start_in_background,
     subprocess_capture,
     wait_until,
 )
@@ -40,58 +34,6 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
     assert abs(a - b) / a < margin_ratio, abs(a - b) / a


-@contextmanager
-def new_pageserver_service(
-    new_pageserver_dir: Path,
-    pageserver_bin: Path,
-    remote_storage_mock_path: Path,
-    pg_port: int,
-    http_port: int,
-    broker: Optional[NeonBroker],
-    pg_distrib_dir: Path,
-):
-    """
-    cannot use NeonPageserver yet because it depends on neon cli
-    which currently lacks support for multiple pageservers
-    """
-    # actually run new pageserver
-    cmd = [
-        str(pageserver_bin),
-        "--workdir",
-        str(new_pageserver_dir),
-        "--update-config",
-        f"-c listen_pg_addr='localhost:{pg_port}'",
-        f"-c listen_http_addr='localhost:{http_port}'",
-        f"-c pg_distrib_dir='{pg_distrib_dir}'",
-        "-c id=2",
-        f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}",
-    ]
-    if broker is not None:
-        cmd.append(
-            f"-c broker_endpoint='{broker.client_url()}'",
-        )
-    pageserver_client = PageserverHttpClient(
-        port=http_port,
-        auth_token=None,
-        is_testing_enabled_or_skip=lambda: True,  # TODO: check if testing really enabled
-    )
-    try:
-        pageserver_process = start_in_background(
-            cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status
-        )
-    except Exception as e:
-        log.error(e)
-        pageserver_process.kill()
-        raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") from e
-
-    log.info("new pageserver started")
-    try:
-        yield pageserver_process
-    finally:
-        log.info("stopping new pageserver")
-        pageserver_process.kill()
-
-
 @contextmanager
 def pg_cur(endpoint):
     with closing(endpoint.connect()) as conn:
@@ -201,7 +143,7 @@ def check_timeline_attached(


 def switch_pg_to_new_pageserver(
-    env: NeonEnv,
+    origin_ps: NeonPageserver,
     endpoint: Endpoint,
     new_pageserver_port: int,
     tenant_id: TenantId,
@@ -216,7 +158,7 @@ def switch_pg_to_new_pageserver(

     endpoint.start()

-    timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
+    timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id)
     files_before_detach = os.listdir(timeline_to_detach_local_path)
     assert (
         "metadata" in files_before_detach
@@ -269,27 +211,32 @@ def test_tenant_relocation(
     with_load: str,
 ):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    neon_env_builder.num_pageservers = 2

     env = neon_env_builder.init_start()

     tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")

     # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
+    env.pageservers[0].allowed_errors.append(
         ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
     )

-    # Needed for detach polling.
-    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
+    # Needed for detach polling on the original pageserver
+    env.pageservers[0].allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
+    # We will dual-attach in this test, so stale generations are expected
+    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates.*")

     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
     remote_storage_mock_path = env.pageserver_remote_storage.root

     # we use two branches to check that they are both relocated
     # first branch is used for load, compute for second one is used to
     # check that data is not lost

-    pageserver_http = env.pageserver.http_client()
+    origin_ps = env.pageservers[0]
+    destination_ps = env.pageservers[1]
+    origin_http = origin_ps.http_client()
+    destination_http = destination_ps.http_client()

     _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
     log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)
@@ -302,7 +249,7 @@ def test_tenant_relocation(
     timeline_id_main, current_lsn_main = populate_branch(
         ep_main,
         tenant_id=tenant_id,
-        ps_http=pageserver_http,
+        ps_http=origin_http,
         create_table=True,
         expected_sum=500500,
     )
@@ -320,17 +267,17 @@ def test_tenant_relocation(
     timeline_id_second, current_lsn_second = populate_branch(
         ep_second,
         tenant_id=tenant_id,
-        ps_http=pageserver_http,
+        ps_http=origin_http,
         create_table=False,
         expected_sum=1001000,
     )

     # wait until pageserver receives that data
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main)
-    timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main)
+    wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_main, current_lsn_main)
+    timeline_detail_main = origin_http.timeline_detail(tenant_id, timeline_id_main)

-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second)
-    timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second)
+    wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_second, current_lsn_second)
+    timeline_detail_second = origin_http.timeline_detail(tenant_id, timeline_id_second)

     if with_load == "with_load":
         # create load table
@@ -350,170 +297,149 @@ def test_tenant_relocation(
         # if user creates a branch during migration
         # it wont appear on the new pageserver
         ensure_checkpoint(
-            pageserver_http=pageserver_http,
+            pageserver_http=origin_http,
             tenant_id=tenant_id,
             timeline_id=timeline_id_main,
             current_lsn=current_lsn_main,
         )

         ensure_checkpoint(
-            pageserver_http=pageserver_http,
+            pageserver_http=origin_http,
             tenant_id=tenant_id,
             timeline_id=timeline_id_second,
             current_lsn=current_lsn_second,
         )

-    log.info("inititalizing new pageserver")
-    # bootstrap second pageserver
-    new_pageserver_dir = env.repo_dir / "new_pageserver"
-    new_pageserver_dir.mkdir()
+    # Migrate either by attaching from s3 or import/export basebackup
+    if method == "major":
+        cmd = [
+            "poetry",
+            "run",
+            "python",
+            str(base_dir / "scripts/export_import_between_pageservers.py"),
+            "--tenant-id",
+            str(tenant_id),
+            "--from-host",
+            "localhost",
+            "--from-http-port",
+            str(origin_http.port),
+            "--from-pg-port",
+            str(origin_ps.service_port.pg),
+            "--to-host",
+            "localhost",
+            "--to-http-port",
+            str(destination_http.port),
+            "--to-pg-port",
+            str(destination_ps.service_port.pg),
+            "--pg-distrib-dir",
+            str(neon_env_builder.pg_distrib_dir),
+            "--work-dir",
+            str(test_output_dir),
+            "--tmp-pg-port",
+            str(port_distributor.get_port()),
+        ]
+        subprocess_capture(test_output_dir, cmd, check=True)

-    new_pageserver_pg_port = port_distributor.get_port()
-    new_pageserver_http_port = port_distributor.get_port()
-    log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
-    pageserver_bin = neon_binpath / "pageserver"
+        destination_ps.allowed_errors.append(
+            ".*ignored .* unexpected bytes after the tar archive.*"
+        )
+    elif method == "minor":
+        # call to attach timeline to new pageserver
+        destination_ps.tenant_attach(tenant_id)

-    new_pageserver_http = PageserverHttpClient(
-        port=new_pageserver_http_port,
-        auth_token=None,
-        is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
-    )
+        # wait for tenant to finish attaching
+        wait_until(
+            number_of_iterations=10,
+            interval=1,
+            func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"),
+        )

-    with new_pageserver_service(
-        new_pageserver_dir,
-        pageserver_bin,
-        remote_storage_mock_path,
-        new_pageserver_pg_port,
-        new_pageserver_http_port,
-        neon_env_builder.broker,
-        neon_env_builder.pg_distrib_dir,
-    ):
-        # Migrate either by attaching from s3 or import/export basebackup
-        if method == "major":
-            cmd = [
-                "poetry",
-                "run",
-                "python",
-                str(base_dir / "scripts/export_import_between_pageservers.py"),
-                "--tenant-id",
-                str(tenant_id),
-                "--from-host",
-                "localhost",
-                "--from-http-port",
-                str(pageserver_http.port),
-                "--from-pg-port",
-                str(env.pageserver.service_port.pg),
-                "--to-host",
-                "localhost",
-                "--to-http-port",
-                str(new_pageserver_http_port),
-                "--to-pg-port",
-                str(new_pageserver_pg_port),
-                "--pg-distrib-dir",
-                str(neon_env_builder.pg_distrib_dir),
-                "--work-dir",
-                str(test_output_dir),
-                "--tmp-pg-port",
-                str(port_distributor.get_port()),
-            ]
-            subprocess_capture(test_output_dir, cmd, check=True)
-        elif method == "minor":
-            # call to attach timeline to new pageserver
-            new_pageserver_http.tenant_attach(tenant_id)

-            # wait for tenant to finish attaching
-            wait_until(
-                number_of_iterations=10,
-                interval=1,
-                func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"),
-            )

-        check_timeline_attached(
-            new_pageserver_http,
-            tenant_id,
-            timeline_id_main,
-            timeline_detail_main,
-            current_lsn_main,
-        )

-        check_timeline_attached(
-            new_pageserver_http,
-            tenant_id,
-            timeline_id_second,
-            timeline_detail_second,
-            current_lsn_second,
-        )

-        # rewrite neon cli config to use new pageserver for basebackup to start new compute
-        lines = (env.repo_dir / "config").read_text().splitlines()
-        for i, line in enumerate(lines):
-            if line.startswith("listen_http_addr"):
-                lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
-            if line.startswith("listen_pg_addr"):
-                lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
-        (env.repo_dir / "config").write_text("\n".join(lines))

-        old_local_path_main = switch_pg_to_new_pageserver(
-            env,
-            ep_main,
-            new_pageserver_pg_port,
-            tenant_id,
-            timeline_id_main,
-        )
+    check_timeline_attached(
+        destination_http,
+        tenant_id,
+        timeline_id_main,
+        timeline_detail_main,
+        current_lsn_main,
+    )

-        old_local_path_second = switch_pg_to_new_pageserver(
-            env,
-            ep_second,
-            new_pageserver_pg_port,
-            tenant_id,
-            timeline_id_second,
-        )
+    check_timeline_attached(
+        destination_http,
+        tenant_id,
+        timeline_id_second,
+        timeline_detail_second,
+        current_lsn_second,
+    )

-        # detach tenant from old pageserver before we check
-        # that all the data is there to be sure that old pageserver
-        # is no longer involved, and if it is, we will see the error
-        pageserver_http.tenant_detach(tenant_id)
+    # rewrite neon cli config to use new pageserver for basebackup to start new compute
+    lines = (env.repo_dir / "config").read_text().splitlines()
+    for i, line in enumerate(lines):
+        if line.startswith("listen_http_addr"):
+            lines[i] = f"listen_http_addr = 'localhost:{destination_http.port}'"
+        if line.startswith("listen_pg_addr"):
+            lines[i] = f"listen_pg_addr = 'localhost:{destination_ps.service_port.pg}'"
+    (env.repo_dir / "config").write_text("\n".join(lines))

-        # Wait a little, so that the detach operation has time to finish.
-        wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
+    old_local_path_main = switch_pg_to_new_pageserver(
+        origin_ps,
+        ep_main,
+        destination_ps.service_port.pg,
+        tenant_id,
+        timeline_id_main,
+    )

-        post_migration_check(ep_main, 500500, old_local_path_main)
-        post_migration_check(ep_second, 1001000, old_local_path_second)
+    old_local_path_second = switch_pg_to_new_pageserver(
+        origin_ps,
+        ep_second,
+        destination_ps.service_port.pg,
+        tenant_id,
+        timeline_id_second,
+    )

-        # ensure that we can successfully read all relations on the new pageserver
-        with pg_cur(ep_second) as cur:
-            cur.execute(
-                """
-                DO $$
-                DECLARE
-                    r RECORD;
-                BEGIN
-                    FOR r IN
-                    SELECT relname FROM pg_class WHERE relkind='r'
-                    LOOP
-                        RAISE NOTICE '%', r.relname;
-                        EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
-                    END LOOP;
-                END$$;
-                """
-            )
+    # detach tenant from old pageserver before we check
+    # that all the data is there to be sure that old pageserver
+    # is no longer involved, and if it is, we will see the error
+    origin_http.tenant_detach(tenant_id)

-        if with_load == "with_load":
-            assert load_ok_event.wait(3)
-            log.info("stopping load thread")
-            load_stop_event.set()
-            load_thread.join(timeout=10)
-            log.info("load thread stopped")
+    # Wait a little, so that the detach operation has time to finish.
+    wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)

-        # bring old pageserver back for clean shutdown via neon cli
-        # new pageserver will be shut down by the context manager
-        lines = (env.repo_dir / "config").read_text().splitlines()
-        for i, line in enumerate(lines):
-            if line.startswith("listen_http_addr"):
-                lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
-            if line.startswith("listen_pg_addr"):
-                lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
-        (env.repo_dir / "config").write_text("\n".join(lines))
+    post_migration_check(ep_main, 500500, old_local_path_main)
+    post_migration_check(ep_second, 1001000, old_local_path_second)

+    # ensure that we can successfully read all relations on the new pageserver
+    with pg_cur(ep_second) as cur:
+        cur.execute(
+            """
+            DO $$
+            DECLARE
+                r RECORD;
+            BEGIN
+                FOR r IN
+                SELECT relname FROM pg_class WHERE relkind='r'
+                LOOP
+                    RAISE NOTICE '%', r.relname;
+                    EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
+                END LOOP;
+            END$$;
+            """
+        )

+    if with_load == "with_load":
+        assert load_ok_event.wait(3)
+        log.info("stopping load thread")
+        load_stop_event.set()
+        load_thread.join(timeout=10)
+        log.info("load thread stopped")

+    # bring old pageserver back for clean shutdown via neon cli
+    # new pageserver will be shut down by the context manager
+    lines = (env.repo_dir / "config").read_text().splitlines()
+    for i, line in enumerate(lines):
+        if line.startswith("listen_http_addr"):
+            lines[i] = f"listen_http_addr = 'localhost:{origin_ps.service_port.http}'"
+        if line.startswith("listen_pg_addr"):
+            lines[i] = f"listen_pg_addr = 'localhost:{origin_ps.service_port.pg}'"
+    (env.repo_dir / "config").write_text("\n".join(lines))


 # Simulate hard crash of pageserver and re-attach a tenant with a branch
@@ -571,7 +497,7 @@ def test_emergency_relocate_with_branches_slow_replay(
     # Attach and wait a few seconds to give it time to load the tenants, attach to the
     # safekeepers, and to stream and ingest the WAL up to the pause-point.
     before_attach_time = time.time()
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
     time.sleep(3)

     # The wal ingestion on the main timeline should now be paused at the fail point.
@@ -718,7 +644,7 @@ def test_emergency_relocate_with_branches_createdb(
     # ingest the WAL, but let's make this less dependent on accidental timing.
     pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
     before_attach_time = time.time()
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)

     child_endpoint.start()
     with child_endpoint.cursor(dbname="neondb") as cur:
@@ -297,8 +297,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
     assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"

     # the remote side of local_layer_truncated
-    remote_layer_path = (
-        env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / path.name
+    remote_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, path.name
     )

     # if the upload ever was ongoing, this check would be racy, but at least one
@@ -396,7 +396,7 @@ def test_timeline_resurrection_on_attach(
     ##### Second start, restore the data and ensure that we see only timeline that wasnt deleted
     env.pageserver.start()

-    ps_http.tenant_attach(tenant_id=tenant_id)
+    env.pageserver.tenant_attach(tenant_id=tenant_id)

     wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
@@ -897,7 +897,7 @@ def test_timeline_delete_resumed_on_attach(
     env.pageserver.start()

     # now we call attach
-    ps_http.tenant_attach(tenant_id=tenant_id)
+    env.pageserver.tenant_attach(tenant_id=tenant_id)

     # delete should be resumed
     wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)
@@ -298,17 +298,21 @@ def test_broker(neon_env_builder: NeonEnvBuilder):

     # and wait till remote_consistent_lsn propagates to all safekeepers
     #
-    # TODO: this executes long as timeline on safekeeper is immediately
-    # deactivated once rcl reaches pageserver one, and thus we generally wait
-    # till pageserver reconnects to all safekeepers one by one here. Timeline
-    # status on safekeeper should take into account peers state as well.
+    # This timeout is long: safekeepers learn about remote_consistent_lsn updates when a pageserver
+    # connects, receives a PrimaryKeepAlive, and sends a PageserverFeedback. So the timeout has to encompass:
+    # - pageserver deletion_queue to validate + publish the remote_consistent_lsn
+    # - pageserver to reconnect to all safekeepers one by one, with multi-second delays between
+    #
+    # TODO: timeline status on safekeeper should take into account peers state as well.
+    rcl_propagate_secs = 60
+
     started_at = time.time()
     while True:
         stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
         if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
             break
         elapsed = time.time() - started_at
-        if elapsed > 30:
+        if elapsed > rcl_propagate_secs:
             raise RuntimeError(
                 f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
             )