fix(test_regress): always use storcon virtual pageserver API to set tenant config (#9622)

Problem
-------

Tests that directly call the Pageserver Management API to set tenant
config are flaky if the Pageserver is managed by Storcon because Storcon
is the source of truth and may (theoretically) reconcile a tenant at any
time.

Solution
--------

Switch all users of
`set_tenant_config`/`patch_tenant_config_client_side`
to use the `env.storage_controller.pageserver_api()`

Future Work
-----------

Prevent regressions from creeping in.

And generally clean up up tenant configuration.
Maybe we can avoid the Pageserver having a default tenant config at all
and put the default into Storcon instead?

* => https://github.com/neondatabase/neon/issues/9621

Refs
----

fixes https://github.com/neondatabase/neon/issues/9522
This commit is contained in:
Christian Schwarz
2024-11-04 17:42:08 +01:00
committed by GitHub
parent 0d5a512825
commit 06113e94e6
9 changed files with 44 additions and 22 deletions

View File

@@ -404,6 +404,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
return res.json()
def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
"""
Only use this via storage_controller.pageserver_api().
Storcon is the authority on tenant config - changes you make directly
against pageserver may be reconciled away at any time.
"""
assert "tenant_id" not in config.keys()
res = self.put(
f"http://localhost:{self.port}/v1/tenant/config",
@@ -417,6 +423,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
inserts: Optional[dict[str, Any]] = None,
removes: Optional[list[str]] = None,
):
"""
Only use this via storage_controller.pageserver_api().
See `set_tenant_config` for more information.
"""
current = self.tenant_config(tenant_id).tenant_specific_overrides
if inserts is not None:
current.update(inserts)

View File

@@ -176,17 +176,21 @@ def test_fully_custom_config(positive_env: NeonEnv):
"lsn_lease_length_for_ts": "5s",
}
ps_http = env.pageserver.http_client()
vps_http = env.storage_controller.pageserver_api()
initial_tenant_config = ps_http.tenant_config(env.initial_tenant)
assert initial_tenant_config.tenant_specific_overrides == {}
initial_tenant_config = vps_http.tenant_config(env.initial_tenant)
assert [
(key, val)
for key, val in initial_tenant_config.tenant_specific_overrides.items()
if val is not None
] == []
assert set(initial_tenant_config.effective_config.keys()) == set(
fully_custom_config.keys()
), "ensure we cover all config options"
(tenant_id, _) = env.create_tenant()
ps_http.set_tenant_config(tenant_id, fully_custom_config)
our_tenant_config = ps_http.tenant_config(tenant_id)
vps_http.set_tenant_config(tenant_id, fully_custom_config)
our_tenant_config = vps_http.tenant_config(tenant_id)
assert our_tenant_config.tenant_specific_overrides == fully_custom_config
assert set(our_tenant_config.effective_config.keys()) == set(
fully_custom_config.keys()
@@ -199,10 +203,10 @@ def test_fully_custom_config(positive_env: NeonEnv):
== {k: True for k in fully_custom_config.keys()}
), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
ps_http.tenant_detach(tenant_id)
env.pageserver.tenant_detach(tenant_id)
env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)
assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set(
assert vps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
assert set(vps_http.tenant_config(tenant_id).effective_config.keys()) == set(
fully_custom_config.keys()
), "ensure we cover all config options"

View File

@@ -38,21 +38,24 @@ def test_min_resident_size_override_handling(
neon_env_builder: NeonEnvBuilder, config_level_override: int
):
env = neon_env_builder.init_start()
vps_http = env.storage_controller.pageserver_api()
ps_http = env.pageserver.http_client()
def assert_config(tenant_id, expect_override, expect_effective):
# talk to actual pageserver to _get_ the config, workaround for
# https://github.com/neondatabase/neon/issues/9621
config = ps_http.tenant_config(tenant_id)
assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override
assert config.effective_config.get("min_resident_size_override") == expect_effective
def assert_overrides(tenant_id, default_tenant_conf_value):
ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200})
vps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200})
assert_config(tenant_id, 200, 200)
ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0})
vps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0})
assert_config(tenant_id, 0, 0)
ps_http.set_tenant_config(tenant_id, {})
vps_http.set_tenant_config(tenant_id, {})
assert_config(tenant_id, None, default_tenant_conf_value)
if config_level_override is not None:
@@ -72,7 +75,7 @@ def test_min_resident_size_override_handling(
# Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig.
tenant_id, _ = env.create_tenant(conf={"min_resident_size_override": "100"})
assert_config(tenant_id, 100, 100)
ps_http.set_tenant_config(tenant_id, {})
vps_http.set_tenant_config(tenant_id, {})
assert_config(tenant_id, None, config_level_override)
@@ -457,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
assert (
du_by_timeline[large_tenant] > min_resident_size
), "ensure the larger tenant will get a haircut"
ps_http.patch_tenant_config_client_side(
env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
small_tenant[0], {"min_resident_size_override": min_resident_size}
)
ps_http.patch_tenant_config_client_side(
env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
large_tenant[0], {"min_resident_size_override": min_resident_size}
)

View File

@@ -81,7 +81,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, bui
print_layer_size_histogram(post_ingest)
# since all we have are L0s, we should be getting nice L1s and images out of them now
ps_http.patch_tenant_config_client_side(
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
env.initial_tenant,
{
"compaction_threshold": 1,

View File

@@ -127,7 +127,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
), "sanity check for what above loop is supposed to do"
# create the image layer from the future
ps_http.patch_tenant_config_client_side(
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
tenant_id, {"image_creation_threshold": image_creation_threshold}, None
)
assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1

View File

@@ -46,7 +46,9 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
for sk in env.safekeepers:
sk.stop()
pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3})
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
tenant_id, {"compaction_threshold": 3}
)
# hit the exit failpoint
with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

View File

@@ -146,13 +146,13 @@ def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_build
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
"""
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
vps_http = env.storage_controller.pageserver_api()
# with_fair config should still be settable
ps_http.set_tenant_config(
vps_http.set_tenant_config(
env.initial_tenant,
{"timeline_get_throttle": throttle_config_with_field_fair_set},
)
conf = ps_http.tenant_config(env.initial_tenant)
conf = vps_http.tenant_config(env.initial_tenant)
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
assert_throttle_config_with_field_fair_set(
conf.tenant_specific_overrides["timeline_get_throttle"]

View File

@@ -52,7 +52,9 @@ def test_tenant_s3_restore(
tenant_id = env.initial_tenant
# now lets create the small layers
ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config())
env.storage_controller.pageserver_api().set_tenant_config(
tenant_id, many_small_layers_tenant_config()
)
# Default tenant and the one we created
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1

View File

@@ -511,7 +511,7 @@ def test_compaction_induced_by_detaches_in_history(
assert len(delta_layers(branch_timeline_id)) == 5
client.patch_tenant_config_client_side(
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
env.initial_tenant, {"compaction_threshold": 5}, None
)