From 06113e94e6e0108882dd04820a37a83ac68bf59a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Nov 2024 17:42:08 +0100
Subject: [PATCH] fix(test_regress): always use storcon virtual pageserver API
 to set tenant config (#9622)

Problem
-------

Tests that directly call the Pageserver Management API to set tenant
config are flaky if the Pageserver is managed by Storcon because Storcon
is the source of truth and may (theoretically) reconcile a tenant at any
time.

Solution
--------

Switch all users of
`set_tenant_config`/`patch_tenant_config_client_side`
to use the `env.storage_controller.pageserver_api()`

Future Work
-----------

Prevent regressions from creeping in.

And generally clean up up tenant configuration.
Maybe we can avoid the Pageserver having a default tenant config at all
and put the default into Storcon instead?

* => https://github.com/neondatabase/neon/issues/9621

Refs
----

fixes https://github.com/neondatabase/neon/issues/9522
---
 test_runner/fixtures/pageserver/http.py       | 11 ++++++++++
 .../regress/test_attach_tenant_config.py      | 20 +++++++++++--------
 .../regress/test_disk_usage_eviction.py       | 15 ++++++++------
 .../regress/test_ingestion_layer_size.py      |  2 +-
 .../regress/test_layers_from_future.py        |  2 +-
 .../test_pageserver_crash_consistency.py      |  4 +++-
 .../test_pageserver_getpage_throttle.py       |  6 +++---
 test_runner/regress/test_s3_restore.py        |  4 +++-
 .../regress/test_timeline_detach_ancestor.py  |  2 +-
 9 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 175a1870d4..57a5d6875e 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -404,6 +404,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return res.json()
 
     def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
+        """
+        Only use this via storage_controller.pageserver_api().
+
+        Storcon is the authority on tenant config - changes you make directly
+        against pageserver may be reconciled away at any time.
+        """
         assert "tenant_id" not in config.keys()
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/config",
@@ -417,6 +423,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         inserts: Optional[dict[str, Any]] = None,
         removes: Optional[list[str]] = None,
     ):
+        """
+        Only use this via storage_controller.pageserver_api().
+
+        See `set_tenant_config` for more information.
+        """
         current = self.tenant_config(tenant_id).tenant_specific_overrides
         if inserts is not None:
             current.update(inserts)
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 83d003a5cc..64de7626f4 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -176,17 +176,21 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "lsn_lease_length_for_ts": "5s",
     }
 
-    ps_http = env.pageserver.http_client()
+    vps_http = env.storage_controller.pageserver_api()
 
-    initial_tenant_config = ps_http.tenant_config(env.initial_tenant)
-    assert initial_tenant_config.tenant_specific_overrides == {}
+    initial_tenant_config = vps_http.tenant_config(env.initial_tenant)
+    assert [
+        (key, val)
+        for key, val in initial_tenant_config.tenant_specific_overrides.items()
+        if val is not None
+    ] == []
     assert set(initial_tenant_config.effective_config.keys()) == set(
         fully_custom_config.keys()
     ), "ensure we cover all config options"
 
     (tenant_id, _) = env.create_tenant()
-    ps_http.set_tenant_config(tenant_id, fully_custom_config)
-    our_tenant_config = ps_http.tenant_config(tenant_id)
+    vps_http.set_tenant_config(tenant_id, fully_custom_config)
+    our_tenant_config = vps_http.tenant_config(tenant_id)
     assert our_tenant_config.tenant_specific_overrides == fully_custom_config
     assert set(our_tenant_config.effective_config.keys()) == set(
         fully_custom_config.keys()
@@ -199,10 +203,10 @@ def test_fully_custom_config(positive_env: NeonEnv):
         == {k: True for k in fully_custom_config.keys()}
     ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
 
-    ps_http.tenant_detach(tenant_id)
+    env.pageserver.tenant_detach(tenant_id)
     env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)
 
-    assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
-    assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set(
+    assert vps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
+    assert set(vps_http.tenant_config(tenant_id).effective_config.keys()) == set(
         fully_custom_config.keys()
     ), "ensure we cover all config options"
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 72866766de..c8d3b2ff3e 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -38,21 +38,24 @@ def test_min_resident_size_override_handling(
     neon_env_builder: NeonEnvBuilder, config_level_override: int
 ):
     env = neon_env_builder.init_start()
+    vps_http = env.storage_controller.pageserver_api()
     ps_http = env.pageserver.http_client()
 
     def assert_config(tenant_id, expect_override, expect_effective):
+        # talk to actual pageserver to _get_ the config, workaround for
+        # https://github.com/neondatabase/neon/issues/9621
         config = ps_http.tenant_config(tenant_id)
         assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override
         assert config.effective_config.get("min_resident_size_override") == expect_effective
 
     def assert_overrides(tenant_id, default_tenant_conf_value):
-        ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200})
+        vps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200})
         assert_config(tenant_id, 200, 200)
 
-        ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0})
+        vps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0})
         assert_config(tenant_id, 0, 0)
 
-        ps_http.set_tenant_config(tenant_id, {})
+        vps_http.set_tenant_config(tenant_id, {})
         assert_config(tenant_id, None, default_tenant_conf_value)
 
     if config_level_override is not None:
@@ -72,7 +75,7 @@ def test_min_resident_size_override_handling(
     # Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig.
     tenant_id, _ = env.create_tenant(conf={"min_resident_size_override": "100"})
     assert_config(tenant_id, 100, 100)
-    ps_http.set_tenant_config(tenant_id, {})
+    vps_http.set_tenant_config(tenant_id, {})
     assert_config(tenant_id, None, config_level_override)
 
 
@@ -457,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
     assert (
         du_by_timeline[large_tenant] > min_resident_size
     ), "ensure the larger tenant will get a haircut"
-    ps_http.patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
         small_tenant[0], {"min_resident_size_override": min_resident_size}
     )
-    ps_http.patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
         large_tenant[0], {"min_resident_size_override": min_resident_size}
     )
 
diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py
index 2edbf4d6d3..646dac8e6e 100644
--- a/test_runner/regress/test_ingestion_layer_size.py
+++ b/test_runner/regress/test_ingestion_layer_size.py
@@ -81,7 +81,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, bui
     print_layer_size_histogram(post_ingest)
 
     # since all we have are L0s, we should be getting nice L1s and images out of them now
-    ps_http.patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
         env.initial_tenant,
         {
             "compaction_threshold": 1,
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 2536ec1b3c..309e0f3015 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -127,7 +127,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     ), "sanity check for what above loop is supposed to do"
 
     # create the image layer from the future
-    ps_http.patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
         tenant_id, {"image_creation_threshold": image_creation_threshold}, None
     )
     assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py
index ac46d3e62a..fcae7983f4 100644
--- a/test_runner/regress/test_pageserver_crash_consistency.py
+++ b/test_runner/regress/test_pageserver_crash_consistency.py
@@ -46,7 +46,9 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
     for sk in env.safekeepers:
         sk.stop()
 
-    pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3})
+    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+        tenant_id, {"compaction_threshold": 3}
+    )
     # hit the exit failpoint
     with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
         pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 6811d09cff..f1aad85fe9 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -146,13 +146,13 @@ def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_build
     To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
     """
     env = neon_env_builder.init_start()
-    ps_http = env.pageserver.http_client()
+    vps_http = env.storage_controller.pageserver_api()
     # with_fair config should still be settable
-    ps_http.set_tenant_config(
+    vps_http.set_tenant_config(
         env.initial_tenant,
         {"timeline_get_throttle": throttle_config_with_field_fair_set},
     )
-    conf = ps_http.tenant_config(env.initial_tenant)
+    conf = vps_http.tenant_config(env.initial_tenant)
     assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
     assert_throttle_config_with_field_fair_set(
         conf.tenant_specific_overrides["timeline_get_throttle"]
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index bedc9b5865..7a9e6d62b2 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -52,7 +52,9 @@ def test_tenant_s3_restore(
     tenant_id = env.initial_tenant
 
     # now lets create the small layers
-    ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config())
+    env.storage_controller.pageserver_api().set_tenant_config(
+        tenant_id, many_small_layers_tenant_config()
+    )
 
     # Default tenant and the one we created
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index d467c59e62..ed47f9432b 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -511,7 +511,7 @@ def test_compaction_induced_by_detaches_in_history(
 
         assert len(delta_layers(branch_timeline_id)) == 5
 
-        client.patch_tenant_config_client_side(
+        env.storage_controller.pageserver_api().patch_tenant_config_client_side(
             env.initial_tenant, {"compaction_threshold": 5}, None
         )