reusable abstraction for many tenants fixture

2026-05-27 18:10:37 +00:00 · 2024-01-05 18:08:59 +00:00
parent dd69927953
commit 392e014a7f
3 changed files with 151 additions and 108 deletions
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -0,0 +1,110 @@
+from dataclasses import dataclass
+import json
+import os
+import shutil
+import subprocess
+from pathlib import Path
+import time
+from typing import Any, Callable, Dict, List, Tuple
+
+import pytest
+from fixtures.benchmark_fixture import NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    SnapshotDir,
+    last_flush_lsn_upload,
+)
+from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.types import TenantId, TimelineId
+import fixtures.pageserver.remote_storage
+
+@dataclass
+class SingleTimeline:
+    env: NeonEnv
+    timeline_id: TimelineId
+    tenants: List[TenantId]
+
+def single_timeline(
+    neon_env_builder: NeonEnvBuilder,
+    snapshot_dir: SnapshotDir,
+    setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+    ncopies: int,
+) ->  SingleTimeline:
+    """
+    Create (or rehydrate from `snapshot_dir`) an env with `ncopies` copies
+    of a template tenant with a single timeline.
+    """
+
+    save_snapshot = os.getenv("CI", "false") != "true"
+
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    if snapshot_dir.is_initialized():
+        save_snapshot = False
+        env = neon_env_builder.from_repo_dir(snapshot_dir.path)
+        ps_http = env.pageserver.http_client()
+        tenants = list(
+            {TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))}
+        )
+        template_timeline = env.initial_timeline
+
+        neon_env_builder.start()
+    else:
+        env = neon_env_builder.init_start()
+
+        remote_storage = env.pageserver_remote_storage
+        assert isinstance(remote_storage, LocalFsStorage)
+
+        ps_http = env.pageserver.http_client()
+        # clean up the useless default tenant
+        ps_http.tenant_delete(env.initial_tenant)
+
+        template_tenant, template_timeline, template_config = setup_template(env)
+
+        env.pageserver.http_client().tenant_detach(template_tenant)
+        # duplicate the template tenant `ncopies` times in localfs storage
+        tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
+
+        # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+        # However, on-demand downloads are quite slow ATM.
+        # => do the on-demand downloads in Python.
+        assert ps_http.tenant_list() == []
+        # make the attach fail after it created enough on-disk state to retry loading
+        # the tenant on next startup, but before it can start background loops that would start downloads
+        ps_http.configure_failpoints(("attach-before-activate", "return"))
+        env.pageserver.allowed_errors.append(
+            ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
+        )
+        for tenant in tenants:
+            env.pageserver.tenant_attach(
+                tenant,
+                config=template_config.copy(),
+            )
+            wait_until_tenant_state(ps_http, tenant, "Broken", 3)
+        env.pageserver.stop()  # clears the failpoint as a side-effect
+        tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+        fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+            env, tenant_timelines
+        )
+        env.pageserver.start()
+
+    for tenant in tenants:
+        wait_until_tenant_active(ps_http, tenant)
+
+    # ensure all layers are resident for predictable performance
+    for tenant in tenants:
+        for timeline in ps_http.tenant_status(tenant)["timelines"]:
+            info = ps_http.layer_map_info(tenant, timeline)
+            for layer in info.historic_layers:
+                assert not layer.remote
+
+    # take snapshot after downloading all layers so tenant dir restoration is fast
+    if save_snapshot:
+        shutil.copytree(env.repo_dir, snapshot_dir.path)
+        snapshot_dir.set_initialized()
+
+    return SingleTimeline(env, template_timeline, tenants)
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/types.py
@@ -31,10 +31,10 @@ class DeltaLayerFileName:
    key_start: Key
    key_end: Key

-    def is_l0(self):
+    def is_l0(self) -> bool:
        return self.key_start == KEY_MIN and self.key_end == KEY_MAX

-    def to_str(self):
+    def to_str(self) -> str:
        ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
        assert self == parse_layer_file_name(ret)
        return ret
--- a/test_runner/performance/test_pageserver_pagebench.py
+++ b/test_runner/performance/test_pageserver_pagebench.py
@@ -1,12 +1,8 @@
 import json
-import os
-import shutil
-import subprocess
 from pathlib import Path
-import time
 from typing import List, Tuple
-
 import pytest
+
 from fixtures.benchmark_fixture import NeonBenchmarker
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -16,123 +12,60 @@ from fixtures.neon_fixtures import (
    SnapshotDir,
    last_flush_lsn_upload,
 )
-from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
-from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
-import fixtures.pageserver.remote_storage
+import fixtures.pageserver.many_tenants


@pytest.fixture(scope="function")
@pytest.mark.timeout(1000)
-def snapshotting_env(
+def getpage_throughput_fixture(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
    test_snapshot_dir: SnapshotDir,
-) -> Tuple[NeonEnv, TimelineId, List[TenantId]]:
-    """
-    The fixture prepares environment or restores it from a snapshot.
-
-    The logic is the following:
-    - if the snapshot directory exists, the snapshot is restored from it
-    - if there is no snapshot, the environment is initialized from scratch and stored in a snapshot
-    - if the fixture is executed on CI (it has CI=true in the environment), the snapshot is not saved
-    """
-
-    save_snapshot = os.getenv("CI", "false") != "true"
-
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
-    # create our template tenant
-    tenant_config_mgmt_api = {
-        "gc_period": "0s",
-        "checkpoint_timeout": "10 years",
-        "compaction_period": "20 s",
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
-    }
-
-    if test_snapshot_dir.is_initialized():
-        save_snapshot = False
-        env = neon_env_builder.from_repo_dir(test_snapshot_dir.path)
-        ps_http = env.pageserver.http_client()
-        tenants = list(
-            {TenantId(t.name) for t in (test_snapshot_dir.path.glob("pageserver_*/tenants/*"))}
-        )
-        template_timeline = env.initial_timeline
-
-        neon_env_builder.start()
-    else:
-        env = neon_env_builder.init_start()
-
-        remote_storage = env.pageserver_remote_storage
-        assert isinstance(remote_storage, LocalFsStorage)
-
-        ps_http = env.pageserver.http_client()
-        # clean up the useless default tenant
-        ps_http.tenant_delete(env.initial_tenant)
-
-        tenant_config_cli = {k: str(v) for k, v in tenant_config_mgmt_api.items()}
-
-        template_tenant, template_timeline = env.neon_cli.create_tenant(
-            conf=tenant_config_cli, set_default=True
-        )
-        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
-            pg_bin.run_capture(["pgbench", "-i", "-s5", ep.connstr()])
+) -> fixtures.pageserver.many_tenants.SingleTimeline:
+    def setup_template(env: NeonEnv):
+        # create our template tenant
+        config = {
+            "gc_period": "0s",
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "20 s",
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.tenant_attach(template_tenant, config)
+        # with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+        #     pg_bin.run_capture(["pgbench", "-i", "-s5", ep.connstr()])
+        #     last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
+        ep = env.endpoints.create_start("main", tenant_id=template_tenant)
+        ep.safe_psql("create table foo(b text)")
+        for i in range(0, 8):
+            ep.safe_psql("insert into foo(b) values ('some text')")
            last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
-        ps_http.tenant_detach(template_tenant)
+        ep.stop_and_destroy()
+        return (template_tenant, template_timeline, config)

-        # duplicate the template 20 times tenants in localfs storage
-        tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, 20)
-
-        # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
-        # However, on-demand downloads are quite slow ATM.
-        # => do the on-demand downloads in Python.
-        assert ps_http.tenant_list() == []
-        # make the attach fail after it created enough on-disk state to retry loading
-        # the tenant next startup, but before it can start background loops that would start download
-        ps_http.configure_failpoints(("attach-before-activate", "return"))
-        env.pageserver.allowed_errors.append(
-            ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
-        )
-        for tenant in tenants:
-            env.pageserver.tenant_attach(
-                tenant,
-                config=tenant_config_mgmt_api.copy(),
-            )
-            wait_until_tenant_state(ps_http, tenant, "Broken", 3)
-        env.pageserver.stop()  # clears the failpoint as a side-effect
-        tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
-        fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
-            env, tenant_timelines
-        )
-        env.pageserver.start()
-
-    for tenant in tenants:
-        wait_until_tenant_active(ps_http, tenant)
-
-    # ensure all layers are resident for predictiable performance
-    for tenant in tenants:
-        for timeline in ps_http.tenant_status(tenant)["timelines"]:
-            info = ps_http.layer_map_info(tenant, timeline)
-            for layer in info.historic_layers:
-                assert not layer.remote
-
-    # take snapshot after download all layers so tenant dir restoration is fast
-    if save_snapshot:
-        shutil.copytree(env.repo_dir, test_snapshot_dir.path)
-        test_snapshot_dir.set_initialized()
-
-    return env, template_timeline, tenants
+    return fixtures.pageserver.many_tenants.single_timeline(
+        neon_env_builder,
+        test_snapshot_dir,
+        setup_template,
+        20_000,
+    )


 def test_getpage_throughput(
-    snapshotting_env: Tuple[NeonEnv, TimelineId, List[TenantId]],
+    getpage_throughput_fixture: fixtures.pageserver.many_tenants.SingleTimeline,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
 ):
-    env, template_timeline, tenants = snapshotting_env
+    env, template_timeline, tenants = (
+        getpage_throughput_fixture.env,
+        getpage_throughput_fixture.timeline_id,
+        getpage_throughput_fixture.tenants,
+    )
    ps_http = env.pageserver.http_client()

    # run the benchmark with one client per timeline, each doing 10k requests to random keys.